From 72b1e22dfcad1daca6906148fd956ffe404bb0bc Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:45 -0700 Subject: x64, x2apic/intr-remap: generic irq migration support from process context Generic infrastructure for migrating the irq from the process context in the presence of CONFIG_GENERIC_PENDING_IRQ. This will be used later for migrating irq in the presence of interrupt-remapping. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 46d6611a33bb..628b5572a7c2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -87,7 +87,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) set_balance_irq_affinity(irq, cpumask); #ifdef CONFIG_GENERIC_PENDING_IRQ - set_pending_irq(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->chip->set_affinity(irq, cpumask); + spin_unlock_irqrestore(&desc->lock, flags); + } else + set_pending_irq(irq, cpumask); #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); -- cgit v1.2.1 From 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 6 Jul 2008 17:23:55 +0800 Subject: rcu classic: simplify the next pending batch use a batch number(rcp->pending) instead of a flag(rcp->next_pending) rcu_start_batch() need to change this flag, so mb()s is needed for memory-access safe. but(after this patch applied) rcu_start_batch() do not change this batch number(rcp->pending), rcp->pending is managed by __rcu_process_callbacks only, and troublesome mb()s are eliminated. And codes look simpler and clearer. Signed-off-by: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Cc: Gautham Shenoy Cc: Dhaval Giani Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 16eeeaa9d618..03726eb95193 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -60,12 +60,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -276,14 +278,8 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); rcp->cur++; /* @@ -441,16 +437,14 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, /* determine batch number */ rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() - */ - smp_rmb(); - if (!rcp->next_pending) { + if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } spin_unlock(&rcp->lock); } } -- cgit v1.2.1 From 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 6 Jul 2008 17:23:59 +0800 Subject: rcu classic: new algorithm for callbacks-processing(v2) This is v2, it's a little deference from v1 that I had send to lkml. use ACCESS_ONCE use rcu_batch_after/rcu_batch_before for batch # comparison. rcutorture test result: (hotplugs: do cpu-online/offline once per second) No CONFIG_NO_HZ: OK, 12hours No CONFIG_NO_HZ, hotplugs: OK, 12hours CONFIG_NO_HZ=y: OK, 24hours CONFIG_NO_HZ=y, hotplugs: Failed. (Failed also without my patch applied, exactly the same bug occurred, http://lkml.org/lkml/2008/7/3/24) v1's email thread: http://lkml.org/lkml/2008/6/2/539 v1's description: The code/algorithm of the implement of current callbacks-processing is very efficient and technical. But when I studied it and I found a disadvantage: In multi-CPU systems, when a new RCU callback is being queued(call_rcu[_bh]), this callback will be invoked after the grace period for the batch with batch number = rcp->cur+2 has completed very very likely in current implement. Actually, this callback can be invoked after the grace period for the batch with batch number = rcp->cur+1 has completed. The delay of invocation means that latency of synchronize_rcu() is extended. But more important thing is that the callbacks usually free memory, and these works are delayed too! it's necessary for reclaimer to free memory as soon as possible when left memory is few. A very simple way can solve this problem: a field(struct rcu_head::batch) is added to record the batch number for the RCU callback. And when a new RCU callback is being queued, we determine the batch number for this callback(head->batch = rcp->cur+1) and we move this callback to rdp->donelist if we find that head->batch <= rcp->completed when we process callbacks. This simple way reduces the wait time for invocation a lot. (about 2.5Grace Period -> 1.5Grace Period in average in multi-CPU systems) This is my algorithm. But I do not add any field for struct rcu_head in my implement. We just need to memorize the last 2 batches and their batch number, because these 2 batches include all entries that for whom the grace period hasn't completed. So we use a special linked-list rather than add a field. Please see the comment of struct rcu_data. Signed-off-by: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Cc: Gautham Shenoy Cc: Dhaval Giani Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 157 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 03726eb95193..d3553ee55f64 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -120,6 +120,43 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + smp_mb(); /* reads the most recently updated value of rcu->cur. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -135,18 +172,11 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -171,20 +201,11 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -213,12 +234,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -360,13 +375,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. */ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + if (list) { + local_irq_disable(); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_enable(); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, @@ -380,9 +397,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + /* spin_lock implies smp_mb() */ + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); local_irq_disable(); this_rdp->qlen += rdp->qlen; @@ -416,27 +433,37 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } - - if (rdp->nxtlist && !rdp->curlist) { + if (rdp->nxtlist) { local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily + */ + if (!rcu_batch_before(rcp->completed, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(rcp->completed, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; + + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } - /* determine batch number */ - rdp->batch = rcp->cur + 1; + local_irq_enable(); if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ @@ -462,15 +489,26 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + if (rdp->nxtlist) { + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (!rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + if (!rcu_batch_before(rcp->completed, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -506,7 +544,7 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } void rcu_check_callbacks(int cpu, int user) @@ -553,8 +591,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; -- cgit v1.2.1 From f84dbb912f344270f31d5cce974f12908a47798d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 10 Jul 2008 14:48:54 -0700 Subject: genirq: enable polling for disabled screaming irqs When we disable a screaming irq we never see it again. If the irq line is shared or if the driver half works this is a real pain. So periodically poll the handlers for screaming interrupts. I use a timer instead of the classic irq poll technique of working off the timer interrupt because when we use the local apic timers note_interrupt is never called (bug?). Further on a system with dynamic ticks the timer interrupt might not even fire unless there is a timer telling it it needs to. I forced this case on my test system with an e1000 nic and my ssh session remained responsive despite the interrupt handler only being called every 10th of a second. Signed-off-by: Eric W. Biederman Signed-off-by: Ingo Molnar --- kernel/irq/spurious.c | 146 +++++++++++++++++++++++++++++++------------------- 1 file changed, 91 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c66d3f10e853..19fe9d6ebfe8 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -12,83 +12,117 @@ #include #include #include +#include static int irqfixup __read_mostly; +#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) +static void poll_spurious_irqs(unsigned long dummy); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); + /* * Recovery handler for misrouted interrupts. */ -static int misrouted_irq(int irq) +static int try_one_irq(int irq, struct irq_desc *desc) { - int i; + struct irqaction *action; int ok = 0; int work = 0; /* Did we do work for a real IRQ */ - for (i = 1; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - struct irqaction *action; - - if (i == irq) /* Already tried */ - continue; - - spin_lock(&desc->lock); - /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); - continue; - } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; + spin_lock(&desc->lock); + /* Already running on another processor */ + if (desc->status & IRQ_INPROGRESS) { + /* + * Already running: If it is shared get the other + * CPU to go looking for our mystery interrupt too + */ + if (desc->action && (desc->action->flags & IRQF_SHARED)) + desc->status |= IRQ_PENDING; spin_unlock(&desc->lock); + return ok; + } + /* Honour the normal IRQ locking */ + desc->status |= IRQ_INPROGRESS; + action = desc->action; + spin_unlock(&desc->lock); - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(i, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; + while (action) { + /* Only shared IRQ handlers are safe to call */ + if (action->flags & IRQF_SHARED) { + if (action->handler(irq, action->dev_id) == + IRQ_HANDLED) + ok = 1; } - local_irq_disable(); - /* Now clean up the flags */ - spin_lock(&desc->lock); - action = desc->action; + action = action->next; + } + local_irq_disable(); + /* Now clean up the flags */ + spin_lock(&desc->lock); + action = desc->action; + /* + * While we were looking for a fixup someone queued a real + * IRQ clashing with our walk: + */ + while ((desc->status & IRQ_PENDING) && action) { /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: - */ - while ((desc->status & IRQ_PENDING) && action) { - /* - * Perform real IRQ processing for the IRQ we deferred - */ - work = 1; - spin_unlock(&desc->lock); - handle_IRQ_event(i, action); - spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too + * Perform real IRQ processing for the IRQ we deferred */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(i); + work = 1; spin_unlock(&desc->lock); + handle_IRQ_event(irq, action); + spin_lock(&desc->lock); + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + /* + * If we did actual work for the real IRQ line we must let the + * IRQ controller clean up too + */ + if (work && desc->chip && desc->chip->end) + desc->chip->end(irq); + spin_unlock(&desc->lock); + + return ok; +} + +static int misrouted_irq(int irq) +{ + int i; + int ok = 0; + + for (i = 1; i < NR_IRQS; i++) { + struct irq_desc *desc = irq_desc + i; + + if (i == irq) /* Already tried */ + continue; + + if (try_one_irq(i, desc)) + ok = 1; } /* So the caller can adjust the irq error counts */ return ok; } +static void poll_spurious_irqs(unsigned long dummy) +{ + int i; + for (i = 1; i < NR_IRQS; i++) { + struct irq_desc *desc = irq_desc + i; + unsigned int status; + + /* Racy but it doesn't matter */ + status = desc->status; + barrier(); + if (!(status & IRQ_SPURIOUS_DISABLED)) + continue; + + try_one_irq(i, desc); + } + + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); +} + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -212,6 +246,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; desc->chip->disable(irq); + + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } desc->irqs_unhandled = 0; } -- cgit v1.2.1 From 8d00a6c8f6b08e7167bc03bf955cdc7e47c5132e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Jul 2008 08:39:57 +0200 Subject: genirq: remove last NO_IDLE_HZ leftovers Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 5fa6198e9139..f4c8a03a9fbb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -131,8 +131,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - handle_dynamic_tick(action); - if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); -- cgit v1.2.1 From 67182ae1c42206e516f7efb292b745e826497b24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Aug 2008 18:35:38 -0700 Subject: rcu, debug: detect stalled grace periods this is a diagnostic patch for Classic RCU. The approach is to record a timestamp at the beginning of the grace period (in rcu_start_batch()), then have rcu_check_callbacks() complain if: 1. it is running on a CPU that has holding up grace periods for a long time (say one second). This will identify the culprit assuming that the culprit has not disabled hardware irqs, instruction execution, or some such. 2. it is running on a CPU that is not holding up grace periods, but grace periods have been held up for an even longer time (say two seconds). It is enabled via the default-off CONFIG_DEBUG_RCU_STALL kernel parameter. Rather than exponential backoff, it backs off to once per 30 seconds. My feeling upon thinking on it was that if you have stalled RCU grace periods for that long, a few extra printk() messages are probably the least of your worries... Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Yinghai Lu Cc: David Witbrodt Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index d4271146a9bd..d7ec731de75c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -286,6 +287,81 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + +#ifdef CONFIG_DEBUG_RCU_STALL + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_check = get_seconds() + 3; +} +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock(&rcp->lock); + delta = get_seconds() - rcp->gp_check; + if (delta < 2L || + cpus_empty(rcp->cpumask)) { + spin_unlock(&rcp->lock); + return; + rcp->gp_check = get_seconds() + 30; + } + spin_unlock(&rcp->lock); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_cpu_mask(cpu, rcp->cpumask) + printk(" %d", cpu); + printk(" (detected by %d, t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); +} +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); + dump_stack(); + spin_lock(&rcp->lock); + if ((long)(get_seconds() - rcp->gp_check) >= 0L) + rcp->gp_check = get_seconds() + 30; + spin_unlock(&rcp->lock); +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long delta; + + delta = get_seconds() - rcp->gp_check; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { + + /* We haven't checked in, so go dump stack. */ + + print_cpu_stall(rcp); + + } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) { + + /* They had two seconds to dump stack, so complain. */ + + print_other_cpu_stall(rcp); + + } +} + +#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -296,6 +372,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; + record_gp_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -489,6 +566,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp, rdp); + if (rdp->nxtlist) { /* * This cpu has pending rcu entries and the grace period -- cgit v1.2.1 From 78635fc739b1254f3e0362ac430edbdd2cff01dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 11 Aug 2008 13:34:15 +0200 Subject: rcu, debug: detect stalled grace periods, cleanups small cleanups. Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index d7ec731de75c..56b8712fe5aa 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -294,6 +294,7 @@ static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) { rcp->gp_check = get_seconds() + 3; } + static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) { int cpu; @@ -303,11 +304,9 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) spin_lock(&rcp->lock); delta = get_seconds() - rcp->gp_check; - if (delta < 2L || - cpus_empty(rcp->cpumask)) { + if (delta < 2L || cpus_empty(rcp->cpumask)) { spin_unlock(&rcp->lock); return; - rcp->gp_check = get_seconds() + 30; } spin_unlock(&rcp->lock); @@ -319,6 +318,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) printk(" (detected by %d, t=%lu/%lu)\n", smp_processor_id(), get_seconds(), rcp->gp_check); } + static void print_cpu_stall(struct rcu_ctrlblk *rcp) { printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", @@ -329,8 +329,8 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp) rcp->gp_check = get_seconds() + 30; spin_unlock(&rcp->lock); } -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { long delta; @@ -341,12 +341,11 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, print_cpu_stall(rcp); - } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - - /* They had two seconds to dump stack, so complain. */ - - print_other_cpu_stall(rcp); - + } else { + if (!cpus_empty(rcp->cpumask) && delta >= 2L) { + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } } } @@ -355,8 +354,9 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) { } -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) + +static inline void +check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { } -- cgit v1.2.1 From b845b517b5e3706a3729f6ea83b88ab85f0725b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Aug 2008 21:47:09 +0200 Subject: printk: robustify printk Avoid deadlocks against rq->lock and xtime_lock by deferring the klogd wakeup by polling from the timer tick. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/printk.c | 19 +++++++++++++++++-- kernel/time/tick-sched.c | 2 +- kernel/timer.c | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b51b1567bb55..655cc2ca10cc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -982,10 +982,25 @@ int is_console_locked(void) return console_locked; } -void wake_up_klogd(void) +static DEFINE_PER_CPU(int, printk_pending); + +void printk_tick(void) { - if (!oops_in_progress && waitqueue_active(&log_wait)) + if (__get_cpu_var(printk_pending)) { + __get_cpu_var(printk_pending) = 0; wake_up_interruptible(&log_wait); + } +} + +int printk_needs_cpu(int cpu) +{ + return per_cpu(printk_pending, cpu); +} + +void wake_up_klogd(void) +{ + if (waitqueue_active(&log_wait)) + __get_cpu_var(printk_pending) = 1; } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c00fe44..c13d4f182370 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -255,7 +255,7 @@ void tick_nohz_stop_sched_tick(int inidle) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - if (rcu_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) delta_jiffies = 1; /* * Do not stop the tick, if we are only one off diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1f1593..510fe69351ca 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -978,6 +978,7 @@ void update_process_times(int user_tick) run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); + printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } -- cgit v1.2.1 From 293a17ebc944c958e24e6ffbd1d5a49abdbf489e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Aug 2008 17:25:03 -0700 Subject: rcu: prevent console flood when one CPU sees another AWOL via RCU One small change needed to keep from flooding the console when one CPU notices that another is AWOL. Unless I am missing something subtle. Otherwise the cleanups look good! Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 56b8712fe5aa..dab2676d9d72 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -308,6 +308,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) spin_unlock(&rcp->lock); return; } + rcp->gp_check = get_seconds() + 30; spin_unlock(&rcp->lock); /* OK, time to rat on our buddy... */ -- cgit v1.2.1 From 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Aug 2008 09:21:44 -0700 Subject: rcu: classic RCU locking and memory-barrier cleanups This patch simplifies the locking and memory-barrier usage in the Classic RCU grace-period-detection mechanism, incorporating Lai Jiangshan's feedback from the earlier version (http://lkml.org/lkml/2008/8/1/400 and http://lkml.org/lkml/2008/8/3/43). Passed 10 hours of rcutorture concurrent with CPUs being put online and taken offline on a 128-hardware-thread Power machine. My apologies to whoever in the Eastern Hemisphere was planning to use this machine over the Western Hemisphere night, but it was sitting idle and... So this is ready for tip/core/rcu. This patch is in preparation for moving to a hierarchical algorithm to allow the very large SMP machines -- requested by some people at OLS, and there seem to have been a few recent patches in the 4096-CPU direction as well. The general idea is to move to a much more conservative concurrency design, then apply a hierarchy to reduce contention on the global lock by a few orders of magnitude (larger machines would see greater reductions). The reason for taking a conservative approach is that this code isn't on any fast path. Prototype in progress. This patch is against the linux-tip git tree (tip/core/rcu). If you wish to test this against 2.6.26, use the following set of patches: http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimp-1.patch http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimpfix-3.patch The first patch combines commits 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 and 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 from Lai Jiangshan , and the second patch contains my changes. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index dab2676d9d72..5de126630b10 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -87,6 +87,7 @@ static void force_quiescent_state(struct rcu_data *rdp, int cpu; cpumask_t cpumask; set_need_resched(); + spin_lock(&rcp->lock); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -112,6 +113,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock(&rcp->lock); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -125,7 +127,9 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { long batch; - smp_mb(); /* reads the most recently updated value of rcu->cur. */ + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ /* * Determine the batch number of this callback. @@ -175,7 +179,6 @@ void call_rcu(struct rcu_head *head, unsigned long flags; head->func = func; - head->next = NULL; local_irq_save(flags); __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); @@ -204,7 +207,6 @@ void call_rcu_bh(struct rcu_head *head, unsigned long flags; head->func = func; - head->next = NULL; local_irq_save(flags); __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); @@ -467,17 +469,17 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ spin_lock_bh(&rcp->lock); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - /* spin_lock implies smp_mb() */ rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock_bh(&rcp->lock); local_irq_disable(); this_rdp->qlen += rdp->qlen; @@ -511,16 +513,19 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + long completed_snap; + if (rdp->nxtlist) { local_irq_disable(); + completed_snap = ACCESS_ONCE(rcp->completed); /* * move the other grace-period-completed entries to * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ - if (!rcu_batch_before(rcp->completed, rdp->batch)) + if (!rcu_batch_before(completed_snap, rdp->batch)) rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(rcp->completed, rdp->batch - 1)) + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) rdp->nxttail[0] = rdp->nxttail[1]; /* @@ -561,8 +566,24 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -571,13 +592,15 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) check_cpu_stall(rcp, rdp); if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + /* * This cpu has pending rcu entries and the grace period * for them has completed. */ - if (!rcu_batch_before(rcp->completed, rdp->batch)) + if (!rcu_batch_before(completed_snap, rdp->batch)) return 1; - if (!rcu_batch_before(rcp->completed, rdp->batch - 1) && + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && rdp->nxttail[0] != rdp->nxttail[1]) return 1; if (rdp->nxttail[0] != &rdp->nxtlist) @@ -628,6 +651,12 @@ int rcu_needs_cpu(int cpu) return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -671,6 +700,7 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + spin_lock(&rcp->lock); memset(rdp, 0, sizeof(*rdp)); rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; @@ -678,6 +708,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock(&rcp->lock); } static void __cpuinit rcu_online_cpu(int cpu) -- cgit v1.2.1 From 5802294f1b1895ee19a3d0ae72805da453afb9de Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 Jul 2008 14:20:55 -0400 Subject: rcu: trace fix possible mem-leak In the initialization of the RCU trace module, if rcupreempt_debugfs_init() fails, we never free the the trace buffer. This patch frees the trace buffer in case the debugfs fails. Signed-off-by: Steven Rostedt Reviewed-by: "Paul E. McKenney" Signed-off-by: Ingo Molnar --- kernel/rcupreempt_trace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c34bbc..35c2d3360ecf 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) -- cgit v1.2.1 From cd95851785bcfe95fdf73689e8ecb5a1c5959231 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Aug 2008 07:37:15 -0700 Subject: rcu: fix classic RCU locking cleanup lockdep problem On Fri, Aug 15, 2008 at 04:24:30PM +0200, Ingo Molnar wrote: > > Paul, > > one of your two recent RCU patches caused this lockdep splat in -tip > testing: > > -------------------> > Brought up 2 CPUs > Total of 2 processors activated (6850.87 BogoMIPS). > PM: Adding info for No Bus:platform > khelper used greatest stack depth: 3124 bytes left > > ================================= > [ INFO: inconsistent lock state ] > 2.6.27-rc3-tip #1 > --------------------------------- > inconsistent {softirq-on-W} -> {in-softirq-W} usage. > ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes: > (&rcu_ctrlblk.lock){-+..}, at: [] __rcu_process_callbacks+0x1ac/0x1f0 > {softirq-on-W} state was registered at: > [] __lock_acquire+0x3f4/0x5b0 > [] lock_acquire+0x89/0xc0 > [] _spin_lock+0x3b/0x70 > [] rcu_init_percpu_data+0x29/0x80 > [] rcu_cpu_notify+0xaf/0xd0 > [] notifier_call_chain+0x2d/0x60 > [] __raw_notifier_call_chain+0x1e/0x30 > [] _cpu_up+0x79/0x110 > [] cpu_up+0x4d/0x70 > [] kernel_init+0xb1/0x200 > [] kernel_thread_helper+0x7/0x10 > [] 0xffffffff > irq event stamp: 14 > hardirqs last enabled at (14): [] trace_hardirqs_on+0xb/0x10 > hardirqs last disabled at (13): [] trace_hardirqs_off+0xb/0x10 > softirqs last enabled at (0): [] copy_process+0x276/0x1190 > softirqs last disabled at (11): [] call_on_stack+0x1a/0x30 > > other info that might help us debug this: > no locks held by ksoftirqd/0/4. > > stack backtrace: > Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27-rc3-tip #1 > [] print_usage_bug+0x16c/0x1b0 > [] mark_lock+0xa75/0xb10 > [] ? sched_clock+0x15/0x30 > [] __lock_acquire+0x3ad/0x5b0 > [] lock_acquire+0x89/0xc0 > [] ? __rcu_process_callbacks+0x1ac/0x1f0 > [] _spin_lock+0x3b/0x70 > [] ? __rcu_process_callbacks+0x1ac/0x1f0 > [] __rcu_process_callbacks+0x1ac/0x1f0 > [] rcu_process_callbacks+0x26/0x50 > [] __do_softirq+0x95/0x120 > [] ? __do_softirq+0x0/0x120 > [] call_on_stack+0x1a/0x30 > [] ? ksoftirqd+0x96/0x110 > [] ? ksoftirqd+0x0/0x110 > [] ? kthread+0x47/0x80 > [] ? kthread+0x0/0x80 > [] ? kernel_thread_helper+0x7/0x10 > ======================= > calling init_cpufreq_transition_notifier_list+0x0/0x20 > initcall init_cpufreq_transition_notifier_list+0x0/0x20 returned 0 after 0 msecs > calling net_ns_init+0x0/0x190 > net_namespace: 676 bytes > initcall net_ns_init+0x0/0x190 returned 0 after 0 msecs > calling cpufreq_tsc+0x0/0x20 > initcall cpufreq_tsc+0x0/0x20 returned 0 after 0 msecs > calling reboot_init+0x0/0x20 > initcall reboot_init+0x0/0x20 returned 0 after 0 msecs > calling print_banner+0x0/0x10 > Booting paravirtualized kernel on bare hardware > > <----------------------- > > my guess is on: > > commit 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 > Author: Paul E. McKenney > Date: Tue Aug 5 09:21:44 2008 -0700 > > rcu: classic RCU locking and memory-barrier cleanups > > Ingo Fixes a problem detected by lockdep in which rcu->lock was acquired both in irq context and in process context, but without disabling from process context. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 5de126630b10..fb1f1cc45142 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -700,7 +700,9 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - spin_lock(&rcp->lock); + long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; @@ -708,7 +710,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) -- cgit v1.2.1 From ded00a56e99555c3f4000ef3eebfd5fe0d574565 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Aug 2008 12:50:36 -0700 Subject: rcu: remove redundant ACCESS_ONCE definition from rcupreempt.c Remove the redundant definition of ACCESS_ONCE() from rcupreempt.c in favor of the one in compiler.h. Also merge the comment header from rcupreempt.c's definition into that in compiler.h. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27827931ca0d..ca4bbbe04aa4 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -58,14 +58,6 @@ #include #include -/* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - /* * PREEMPT_RCU data structures. */ -- cgit v1.2.1 From eff9b713ee3540ddab862095aaf4b1511a6758bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Aug 2008 17:51:08 -0700 Subject: rcu: fix locking cleanup fallout Given that the rcp->lock is now acquired from call_rcu(), which can be invoked from irq-disable regions, all acquisitions need to disable irqs. The following patch fixes this. Although I don't have any reason to believe that this is the cause of Yinghai's oops, it does need to be fixed. Signed-off-by: Paul E. McKenney Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index fb1f1cc45142..c6b6cf55f3e2 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -86,8 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -113,7 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -301,17 +303,18 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) { int cpu; long delta; + unsigned long flags; /* Only let one CPU complain about others per time interval. */ - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); delta = get_seconds() - rcp->gp_check; if (delta < 2L || cpus_empty(rcp->cpumask)) { spin_unlock(&rcp->lock); return; } rcp->gp_check = get_seconds() + 30; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); /* OK, time to rat on our buddy... */ @@ -324,13 +327,15 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) static void print_cpu_stall(struct rcu_ctrlblk *rcp) { + unsigned long flags; + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", smp_processor_id(), get_seconds(), rcp->gp_check); dump_stack(); - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if ((long)(get_seconds() - rcp->gp_check) >= 0L) rcp->gp_check = get_seconds() + 30; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -413,6 +418,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -436,7 +443,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -444,7 +451,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -469,21 +476,22 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + /* * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock_bh(&rcp->lock); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -550,12 +558,12 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { rcp->pending = rdp->batch; rcu_start_batch(rcp); } - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } } -- cgit v1.2.1 From 0c925d79234fe77589d8ff3861f9f8bb9e7fc3f6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 18 Aug 2008 21:49:51 -0700 Subject: rcuclassic: fix compilation NG fix: CC kernel/rcuclassic.o kernel/rcuclassic.c: In function '__rcu_process_callbacks': kernel/rcuclassic.c:561: error: 'flags' undeclared (first use in this function) kernel/rcuclassic.c:561: error: (Each undeclared identifier is reported only once kernel/rcuclassic.c:561: error: for each function it appears in.) Declare missing variable flags. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index c6b6cf55f3e2..01e761a6b38c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -557,6 +557,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, local_irq_enable(); if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags; + /* and start it/schedule start if it's a new batch */ spin_lock_irqsave(&rcp->lock, flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { -- cgit v1.2.1 From af4491e51632d01fbc2b856ffa9ebcd4b38db68c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:02 +0200 Subject: sched: rt-bandwidth for user grouping interface rt_runtime is a signed value Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a096..39d6159fae43 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); -- cgit v1.2.1 From 6f0d5c390e4206dcb3804a5072a048fdb7d2b428 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:03 +0200 Subject: sched: rt-bandwidth accounting fix It fixes an accounting bug where we would continue accumulating runtime even though the bandwidth control is disabled. This would lead to very long throttle periods once bandwidth control gets turned on again. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 998ba54b4543..77340b04a538 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -491,9 +488,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.2.1 From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:04 +0200 Subject: sched: rt-bandwidth group disable fixes More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime to disable full group bandwidth control. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- kernel/sched_rt.c | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9a1ddb84e26d..c1bee5fb8154 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +static inline int rt_bandwidth_enabled(void); + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 77340b04a538..94daace5ee15 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -484,6 +484,9 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); -- cgit v1.2.1 From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:05 +0200 Subject: sched: extract walk_tg_tree() Extract walk_tg_tree() and make it a little more generic so we can use it in the schedulablity test. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1bee5fb8154..8c019a19d052 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1426,14 +1412,42 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; } +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else -- cgit v1.2.1 From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:06 +0200 Subject: sched: rt-bandwidth fixes The last patch allows sysctl_sched_rt_runtime to disable bandwidth accounting for the group scheduler - however it doesn't deal with sched_setscheduler(), which will keep tasks out of groups that have no assigned runtime. If we relax this, we get into the situation where RT tasks can get into a group when we disable bandwidth control, and then starve them by enabling it again. Rework the schedulability code to check for this condition and fail to turn on bandwidth control with -EBUSY when this situation is found. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 125 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8c019a19d052..e41bdae2778d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* @@ -5082,7 +5082,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + total = to_ratio(period, runtime); - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8783,14 +8788,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void) rt_runtime = tg->rt_bandwidth.rt_runtime; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(tg, rt_period, rt_runtime); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; -- cgit v1.2.1 From fa33507a22623b3bd543b15a21c362cf364b6cff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2008 09:31:26 +0200 Subject: printk: robustify printk, fix #2 Dmitry Adamushko reported: > [*] btw., with DEBUG being enabled, pr_debug() generates [1] when > debug_smp_processor_id() is used (CONFIG_DEBUG_PREEMPT). > > the problem seems to be caused by the following commit: > commit b845b517b5e3706a3729f6ea83b88ab85f0725b0 > Author: Peter Zijlstra > Date: Fri Aug 8 21:47:09 2008 +0200 > > printk: robustify printk > > > wake_up_klogd() -> __get_cpu_var() -> smp_processor_id() > > and that's being called from release_console_sem() which is, in turn, > said to be "may be called from any context" [2] > > and in this case, it seems to be called from some non-preemptible > context (although, it can't be printk()... > although, I haven't looked carefully yet). > > Provided [2], __get_cpu_var() is perhaps not the right solution there. > > > [1] > > [ 7697.942005] BUG: using smp_processor_id() in preemptible [00000000] code: syslogd/3542 > [ 7697.942005] caller is wake_up_klogd+0x1b/0x50 > [ 7697.942005] Pid: 3542, comm: syslogd Not tainted 2.6.27-rc3-tip-git #2 > [ 7697.942005] Call Trace: > [ 7697.942005] [] debug_smp_processor_id+0xe8/0xf0 > [ 7697.942005] [] wake_up_klogd+0x1b/0x50 > [ 7697.942005] [] release_console_sem+0x1e7/0x200 > [ 7697.942005] [] do_con_write+0xb7/0x1f30 > [ 7697.942005] [] ? show_trace+0x10/0x20 > [ 7697.942005] [] ? dump_stack+0x72/0x80 > [ 7697.942005] [] ? __ratelimit+0xbd/0xe0 > [ 7697.942005] [] ? debug_smp_processor_id+0xe8/0xf0 > [ 7697.942005] [] ? wake_up_klogd+0x1b/0x50 > [ 7697.942005] [] ? release_console_sem+0x1e7/0x200 > [ 7697.942005] [] con_write+0x19/0x30 > [ 7697.942005] [] write_chan+0x276/0x3c0 > [ 7697.942005] [] ? default_wake_function+0x0/0x10 > [ 7697.942005] [] ? _spin_lock_irqsave+0x22/0x50 > [ 7697.942005] [] tty_write+0x194/0x260 > [ 7697.942005] [] ? write_chan+0x0/0x3c0 > [ 7697.942005] [] redirected_tty_write+0xa4/0xb0 > [ 7697.942005] [] ? redirected_tty_write+0x0/0xb0 > [ 7697.942005] [] do_loop_readv_writev+0x52/0x80 > [ 7697.942005] [] do_readv_writev+0x1bd/0x1d0 > [ 7697.942005] [] vfs_writev+0x39/0x60 > [ 7697.942005] [] sys_writev+0x50/0x90 > [ 7697.942005] [] system_call_fastpath+0x16/0x1b Signed-off-by: Peter Zijlstra Reported-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 655cc2ca10cc..57e9cd7a9581 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1000,7 +1000,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - __get_cpu_var(printk_pending) = 1; + __raw_get_cpu_var(printk_pending) = 1; } /** -- cgit v1.2.1 From 1fa63a817d27af7dc0d5ed454eb8fe5dec65fac7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 20 Aug 2008 14:40:55 +0200 Subject: printk: robustify printk, update comment Remove the comment describing the possibility of printk() deadlocking on runqueue lock. Signed-off-by: Jiri Kosina Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/printk.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 57e9cd7a9581..6f27c6a4bdc9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -577,9 +577,6 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. - * Be aware of the fact that if oops_in_progress is not set, we might try to - * wake klogd up which could deadlock on runqueue lock if printk() is called - * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output -- cgit v1.2.1 From 1aa5dfb751d275ae7117d3b73ac423b4a46f2a73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 20 Aug 2008 16:37:28 -0700 Subject: clocksource: keep track of original clocksource frequency The clocksource frequency is represented by clocksource->mult/2^(clocksource->shift). Currently, when NTP makes adjustments to the clock frequency, they are made directly to the mult value. This has the drawback that once changed, we cannot know what the orignal mult value was, or how much adjustment has been applied. This property causes problems in calculating proper ntp intervals when switching back and forth between clocksources. This patch separates the current mult value into a mult and mult_orig pair. The mult_orig value stays constant, while the ntp clocksource adjustments are done only to the mult value. This allows for correct ntp interval calculation and additionally lays the groundwork for a new notion of time, what I'm calling the monotonic-raw time, which is introduced in a following patch. Signed-off-by: John Stultz Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 3 +++ kernel/time/jiffies.c | 1 + 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 093d4acf993b..9ed2eec97526 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; + /* save mult_orig on registration */ + c->mult_orig = c->mult; + spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 4c256fdb8875..1ca99557e929 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; -- cgit v1.2.1 From 9a055117d3d9cb562f83f8d4cd88772761f4cab0 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 20 Aug 2008 16:37:28 -0700 Subject: clocksource: introduce clocksource_forward_now() To keep the raw monotonic patch simple first introduce clocksource_forward_now(), which takes care of the offset since the last update_wall_time() call and adds it to the clock, so there is no need anymore to deal with it explicitly at various places, which need to make significant changes to the clock. This is also gets rid of the timekeeping_suspend_nsecs, instead of waiting until resume, the value is accumulated during suspend. In the end there is only a single user of __get_nsec_offset() left, so I integrated it back to getnstimeofday(). Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 71 ++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f961c9..83d3555a6998 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,27 +58,23 @@ struct clocksource *clock; #ifdef CONFIG_GENERIC_TIME /** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * clocksource_forward_now - update clock to the current time * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. */ -static inline s64 __get_nsec_offset(void) +static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; - s64 ns_offset; + s64 nsec; - /* read clocksource: */ cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + clock->cycle_last = cycle_now; - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); - - return ns_offset; + nsec = cyc2ns(clock, cycle_delta); + timespec_add_ns(&xtime, nsec); } /** @@ -89,6 +85,7 @@ static inline s64 __get_nsec_offset(void) */ void getnstimeofday(struct timespec *ts) { + cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -96,7 +93,15 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - nsecs = __get_nsec_offset(); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@ -129,22 +134,22 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(struct timespec *tv) { + struct timespec ts_delta; unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); - nsec -= __get_nsec_offset(); + clocksource_forward_now(); + + ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + xtime = *tv; - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@ -170,22 +175,17 @@ EXPORT_SYMBOL(do_settimeofday); static void change_clocksource(void) { struct clocksource *new; - cycle_t now; - u64 nsec; new = clocksource_get_next(); if (clock == new) return; - new->cycle_last = 0; - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); + clocksource_forward_now(); clock = new; - clock->cycle_last = now; - + clock->cycle_last = 0; + clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -200,8 +200,8 @@ static void change_clocksource(void) */ } #else +static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -static inline s64 __get_nsec_offset(void) { return 0; } #endif /** @@ -265,8 +265,6 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -/* xtime offset when we went into suspend */ -static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -292,8 +290,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } - /* Make sure that we have the correct xtime reference */ - timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@ -319,8 +315,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* Get the current xtime offset */ - timekeeping_suspend_nsecs = __get_nsec_offset(); + clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -461,10 +456,10 @@ void update_wall_time(void) */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; + clock->cycle_last += clock->cycle_interval; + clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; -- cgit v1.2.1 From 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 20 Aug 2008 16:37:30 -0700 Subject: clocksource: introduce CLOCK_MONOTONIC_RAW In talking with Josip Loncaric, and his work on clock synchronization (see btime.sf.net), he mentioned that for really close synchronization, it is useful to have access to "hardware time", that is a notion of time that is not in any way adjusted by the clock slewing done to keep close time sync. Part of the issue is if we are using the kernel's ntp adjusted representation of time in order to measure how we should correct time, we can run into what Paul McKenney aptly described as "Painting a road using the lines we're painting as the guide". I had been thinking of a similar problem, and was trying to come up with a way to give users access to a purely hardware based time representation that avoided users having to know the underlying frequency and mask values needed to deal with the wide variety of possible underlying hardware counters. My solution is to introduce CLOCK_MONOTONIC_RAW. This exposes a nanosecond based time value, that increments starting at bootup and has no frequency adjustments made to it what so ever. The time is accessed from userspace via the posix_clock_gettime() syscall, passing CLOCK_MONOTONIC_RAW as the clock_id. Signed-off-by: John Stultz Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/posix-timers.c | 15 +++++++++++++++ kernel/time/timekeeping.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbff..d3c66b53dff6 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -222,6 +222,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) return 0; } +/* + * Get monotonic time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -235,9 +244,15 @@ static __init int init_posix_timers(void) .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 83d3555a6998..5099c95b8aa2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -75,6 +75,9 @@ static void clocksource_forward_now(void) nsec = cyc2ns(clock, cycle_delta); timespec_add_ns(&xtime, nsec); + + nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + clock->raw_time.tv_nsec += nsec; } /** @@ -183,6 +186,8 @@ static void change_clocksource(void) clocksource_forward_now(); + new->raw_time = clock->raw_time; + clock = new; clock->cycle_last = 0; clock->cycle_last = clocksource_read(new); @@ -204,6 +209,39 @@ static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } #endif +/** + * getrawmonotonic - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + cycle_t cycle_now, cycle_delta; + + do { + seq = read_seqbegin(&xtime_lock); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + + *ts = clock->raw_time; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(getrawmonotonic); + + /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ @@ -466,6 +504,12 @@ void update_wall_time(void) second_overflow(); } + clock->raw_time.tv_nsec += clock->raw_interval; + if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { + clock->raw_time.tv_nsec -= NSEC_PER_SEC; + clock->raw_time.tv_sec++; + } + /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); -- cgit v1.2.1 From d82f0b0f6f1a0a25afc288fb7135b1601fe6df18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:46:04 -0700 Subject: migrate_timers: add comment, use spinlock_irq() Add the comment to explain why the double lock in migrate_timers() can't deadlock. Change the code to use spinlock_irq() instead of local_irq_disable() + spin_lock(). Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 11 ++++++----- kernel/timer.c | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a74..03ea1378c43b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1620,9 +1620,11 @@ static void migrate_hrtimers(int cpu) new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1631,8 +1633,7 @@ static void migrate_hrtimers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1f1593..e8019cc3418d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1435,9 +1435,11 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1452,8 +1454,7 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.2.1 From 275a89bdd3868af3008852594d2e169eaf69441b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Aug 2008 06:14:55 -0700 Subject: rcu: use irq-safe locks Some earlier tip/core/rcu patches caused RCU to incorrectly enable irqs too early in boot. This caused Yinghai's repeated-kexec testing to hit oopses, presumably due to so that device interrupts left over from the prior kernel instance (which would oops the newly booting kernel before it got a chance to reset said devices). This patch therefore converts all the local_irq_disable()s in rcuclassic.c to local_irq_save(). Besides, I never did like local_irq_disable() anyway. ;-) Signed-off-by: Paul E. McKenney Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 01e761a6b38c..3f6918966bda 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -247,6 +247,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -261,9 +262,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -464,12 +465,14 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, struct rcu_head **tail, long batch) { + unsigned long flags; + if (list) { - local_irq_disable(); + local_irq_save(flags); this_rdp->batch = batch; *this_rdp->nxttail[2] = list; this_rdp->nxttail[2] = tail; - local_irq_enable(); + local_irq_restore(flags); } } @@ -521,10 +524,11 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; long completed_snap; if (rdp->nxtlist) { - local_irq_disable(); + local_irq_save(flags); completed_snap = ACCESS_ONCE(rcp->completed); /* @@ -554,7 +558,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, rdp->nxttail[0] = &rdp->nxtlist; } - local_irq_enable(); + local_irq_restore(flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { unsigned long flags; -- cgit v1.2.1 From 916c7a855174e3b53d182b97a26b2e27a29726a1 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 20 Aug 2008 16:46:08 -0700 Subject: ntp: fix ADJ_OFFSET_SS_READ bug and do_adjtimex() cleanup Thanks to the review by Michael Kerrisk a bug in the recent ADJ_OFFSET_SS_READ option was discovered, where the ntp time_offset was inadvertently set by it. This fixes this by making the adjtime code more separate from the ntp_adjtime code (both of which really want to be separate syscalls). Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Acked-by: John Stultz Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 76 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd8196b..c6921aa1a42a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { } int do_adjtimex(struct timex *txc) { struct timespec ts; - long save_adjust, sec; int result; - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { + /* Validate the data before disabling interrupts */ + if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ - if (txc->modes & ~ADJ_OFFSET_SS_READ) + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* if the quartz is off by more than 10% something is VERY wrong! */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); } - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - if (time_state != TIME_OK && txc->modes & ADJ_STATUS) - hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - /* If there are input parameters, then process them */ + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + goto adj_done; + } if (txc->modes) { + long sec; + if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; - if (txc->modes & ADJ_OFFSET) { - if (txc->modes == ADJ_OFFSET_SINGLESHOT) - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - else - ntp_update_offset(txc->offset); - } + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -389,19 +396,16 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + +adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); -- cgit v1.2.1 From 377bf1e4ac2d894791733270604594c7c851ef83 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 21 Aug 2008 22:58:28 +0400 Subject: genirq: fix irq_desc->depth handling with DEBUG_SHIRQ When DEBUG_SHIRQ is selected, a spurious IRQ is issued before the setup_irq() initializes the desc->depth. An IRQ handler may call disable_irq_nosync(), but then setup_irq() will overwrite desc->depth, and upon enable_irq() we'll catch this WARN: ------------[ cut here ]------------ Badness at kernel/irq/manage.c:180 NIP: c0061ab8 LR: c0061f10 CTR: 00000000 REGS: cf83be50 TRAP: 0700 Not tainted (2.6.27-rc3-23450-g74919b0) MSR: 00021032 CR: 22042022 XER: 20000000 TASK = cf829100[5] 'events/0' THREAD: cf83a000 GPR00: c0061f10 cf83bf00 cf829100 c038e674 00000016 00000000 cf83bef8 00000038 GPR08: c0298910 00000000 c0310d28 cf83a000 00000c9c 1001a1a8 0fffe000 00800000 GPR16: ffffffff 00000000 007fff00 00000000 007ffeb0 c03320a0 c031095c c0310924 GPR24: cf8292ec cf807190 cf83a000 00009032 c038e6a4 c038e674 cf99b1cc c038e674 NIP [c0061ab8] __enable_irq+0x20/0x80 LR [c0061f10] enable_irq+0x50/0x70 Call Trace: [cf83bf00] [c038e674] irq_desc+0x630/0x9000 (unreliable) [cf83bf10] [c0061f10] enable_irq+0x50/0x70 [cf83bf30] [c01abe94] phy_change+0x68/0x108 [cf83bf50] [c0046394] run_workqueue+0xc4/0x16c [cf83bf90] [c0046834] worker_thread+0x74/0xd4 [cf83bfd0] [c004ab7c] kthread+0x48/0x84 [cf83bff0] [c00135e0] kernel_thread+0x44/0x60 Instruction dump: 4e800020 3d20c031 38a94214 4bffffcc 9421fff0 7c0802a6 93e1000c 7c7f1b78 90010014 8123001c 2f890000 409e001c <0fe00000> 80010014 83e1000c 38210010 That trace corresponds to this line: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); The patch fixes the problem by moving the SHIRQ code below the setup_irq(). Unfortunately we can't easily move the SHIRQ code inside the setup_irq(), since it grabs a spinlock, so to prvent a 'real' IRQ from interfere us we should disable that IRQ. p.s. The driver in question is drivers/net/phy/phy.c. Signed-off-by: Anton Vorontsov Cc: David Woodhouse Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 77a51be36010..ae1b684e048c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -596,26 +596,29 @@ int request_irq(unsigned int irq, irq_handler_t handler, action->next = NULL; action->dev_id = dev_id; + retval = setup_irq(irq, action); + if (retval) + kfree(action); + #ifdef CONFIG_DEBUG_SHIRQ if (irqflags & IRQF_SHARED) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... - * We do this before actually registering it, to make sure that - * a 'real' IRQ doesn't run in parallel with our fake + * We disable the irq to make sure that a 'real' IRQ doesn't + * run in parallel with our fake. */ unsigned long flags; + disable_irq(irq); local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); + enable_irq(irq); } #endif - - retval = setup_irq(irq, action); - if (retval) - kfree(action); - return retval; } EXPORT_SYMBOL(request_irq); -- cgit v1.2.1 From a38409fbb5181324fb73b359189a72e02b6c7030 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 12:16:09 +0200 Subject: dma-coherent: export dma_[alloc|release]_from_coherent methods fixes modular builds: ERROR: "dma_alloc_from_coherent" [sound/core/snd-page-alloc.ko] undefined! ERROR: "dma_release_from_coherent" [sound/core/snd-page-alloc.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index c1d4d5b4c61c..f013a0c2e111 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, } return (mem != NULL); } +EXPORT_SYMBOL(dma_alloc_from_coherent); /** * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool @@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) } return 0; } +EXPORT_SYMBOL(dma_release_from_coherent); -- cgit v1.2.1 From 94d3d8247de22c5b0624aa00616ceca459498e55 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:41 -0700 Subject: sched: do_wait_for_common: use signal_pending_state() Change do_wait_for_common() to use signal_pending_state() instead of open coding. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0406ca..da7c5d23cc03 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4599,10 +4599,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } -- cgit v1.2.1 From f31e11d87a5d7601636710195891ba462ad99f11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:44 -0700 Subject: wait_task_inactive(): don't consider task->nivcsw If wait_task_inactive() returns success the task was deactivated. In that case schedule() always increments ->nvcsw which alone can be used as a "generation counter". If the next call returns the same number, we can be sure that the task was unscheduled. Otherwise, because we know that .on_rq == 0 again, ->nvcsw should have been changed in between. Q: perhaps it is better to do "ncsw = (p->nvcsw << 1) | 1" ? This decreases the possibility of "was it unscheduled" false positive when ->nvcsw == 0. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index da7c5d23cc03..908670aa215a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1921,11 +1921,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } + if (!match_state || p->state == match_state) + ncsw = p->nvcsw ?: 1; task_rq_unlock(rq, &flags); /* -- cgit v1.2.1 From 93dcf55f828b035fc93fc19eb03c1390e1e6d570 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:44 -0700 Subject: wait_task_inactive: "improve" the returned value for ->nvcsw == 0 wait_task_inactive() returns 1 when p->nvcsw == 0 || p->nvcsw == 1. This means that two subsequent calls can return the same number while the task was scheduled in between. Change the code to return "nvcsw | LONG_MIN" instead of "nvcsw ?: 1", now the overlap always needs LONG_MAX schedules. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 908670aa215a..6a43c8942b05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1922,7 +1922,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) on_rq = p->se.on_rq; ncsw = 0; if (!match_state || p->state == match_state) - ncsw = p->nvcsw ?: 1; + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* -- cgit v1.2.1 From 65eb3dc609dec17deea48dcd4de2e549d29a9824 Mon Sep 17 00:00:00 2001 From: Kevin Diggs Date: Tue, 26 Aug 2008 10:26:54 +0200 Subject: sched: add kernel doc for the completion, fix kernel-doc-nano-HOWTO.txt This patch adds kernel doc for the completion feature. An error in the split-man.pl PERL snippet in kernel-doc-nano-HOWTO.txt is also fixed. Signed-off-by: Kevin Diggs Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 29e2ec0bd831..93f5ea08be97 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4565,6 +4565,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4576,6 +4585,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4624,12 +4639,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4637,6 +4671,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4646,6 +4687,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4654,6 +4703,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); -- cgit v1.2.1 From 0cd418ddb1ee88df7d16d5df06cb2da68eceb9e4 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 18 Aug 2008 14:39:21 -0700 Subject: rcuclassic: fix compiler warning CC kernel/rcuclassic.o kernel/rcuclassic.c: In function 'rcu_init_percpu_data': kernel/rcuclassic.c:705: warning: comparison of distinct pointer types lacks a cast kernel/rcuclassic.c:713: warning: comparison of distinct pointer types lacks a cast flags should be unsigned long. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 3f6918966bda..743cf0550ff4 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -714,7 +714,7 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - long flags; + unsigned long flags; spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); -- cgit v1.2.1 From aec0a5142cb52aaa152d962d84a838e25d520742 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Thu, 28 Aug 2008 14:42:49 +0530 Subject: sched: call resched_task() conditionally from new task wake up path - During wake up of a new task, task_new_fair() can do a resched_task() on the current task. Later in the code path, check_preempt_curr() also ends up doing the same, which can be avoided. Check if TIF_NEED_RESCHED is already set for the current task. - task_new_fair() does a resched_task() on the current task unconditionally. This can be done only in case when child runs before the parent. So this is a small speedup. Signed-off-by: Bharata B Rao Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c6d4bb..8264bb5dbd51 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1348,6 +1348,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; + /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + cfs_rq_of(pse)->next = pse; /* @@ -1620,10 +1627,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); } /* -- cgit v1.2.1 From 29cbef4869bf288256ab76c7dc674cb132b35de2 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 27 Aug 2008 11:21:39 -0400 Subject: make might_sleep() display the oopsing process Expand might_sleep's printk to indicate the oopsing process. Signed-off-by: Joe Korty Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93f5ea08be97..6e283dc7679a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8233,8 +8233,8 @@ void __might_sleep(char *file, int line) prev_jiffy = jiffies; printk(KERN_ERR "BUG: sleeping function called from invalid" " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -- cgit v1.2.1 From aef745fca016aea45adae5c98e8698904dd8ad51 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Aug 2008 11:34:43 +0200 Subject: sched: clean up __might_sleep() add KERN_ to the printout and clean up the flow a bit. Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6e283dc7679a..b112caaa400a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8226,20 +8226,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), current->pid, current->comm); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); -- cgit v1.2.1 From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 13:40:47 +0200 Subject: sched: extract walk_tg_tree(), fix fix: kernel/sched.c: In function '__rt_schedulable': kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree' kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function) kernel/sched.c:8771: error: (Each undeclared identifier is reported only once kernel/sched.c:8771: error: for each function it appears in.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e41bdae2778d..703f56d5db5f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* -- cgit v1.2.1 From 268364a0f48aee2f851f9d1ef8a6cda0f3039ef1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:02:44 +0200 Subject: IO resources: add reserve_region_with_split() add reserve_region_with_split() to not lose e820 reserved entries if they overlap with existing IO regions: with test case by extend 0xe0000000 - 0xeffffff to 0xdd800000 - we get: e0000000-efffffff : PCI MMCONFIG 0 e0000000-efffffff : reserved and in /proc/iomem we get: found conflict for reserved [dd800000, efffffff], try to reserve with split __reserve_region_with_split: (PCI Bus #80) [dd000000, ddffffff], res: (reserved) [dd800000, efffffff] __reserve_region_with_split: (PCI Bus #00) [de000000, dfffffff], res: (reserved) [de000000, efffffff] initcall pci_subsys_init+0x0/0x121 returned 0 after 381 msecs in dmesg various fixes and improvements suggested by Linus. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/resource.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 03d796c1b2e9..414d6fc9131e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -516,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t return result; } +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + + if (!res) { + printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", + conflict->name, conflict->start, conflict->end, + name, start, end); + + /* failed, split and try again */ + + /* conflict coverred whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + EXPORT_SYMBOL(adjust_resource); /** -- cgit v1.2.1 From 1cf44baad76b6f20f95ece397c6f643320aa44c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Sep 2008 21:26:06 +0200 Subject: IO resources: fix/remove printk Andrew Morton noticed that the printk in kernel/resource.c was buggy: | start and end have type resource_size_t. Such types CANNOT be printed | unless cast to a known type. | | Because there is a %s following an incorrect %lld, the above code will | crash the machine. ... and it's probably quite unneeded as well, so remove it. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/resource.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 414d6fc9131e..fc59dcc4795b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -549,13 +549,9 @@ static void __init __reserve_region_with_split(struct resource *root, } if (!res) { - printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", - conflict->name, conflict->start, conflict->end, - name, start, end); - /* failed, split and try again */ - /* conflict coverred whole area */ + /* conflict covered whole area */ if (conflict->start <= start && conflict->end >= end) return; -- cgit v1.2.1 From 7bb67439bf6bd3782f07f1d7be1e63406453d5de Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 31 Aug 2008 08:05:58 -0700 Subject: select: Introduce a hrtimeout function This patch adds a schedule_hrtimeout() function, to be used by select() and poll() in a later patch. This function works similar to schedule_timeout() in most ways, but takes a timespec rather than jiffies. With a lot of contributions/fixes from Thomas Signed-off-by: Arjan van de Ven Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a74..782137dc755f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1678,3 +1678,68 @@ void __init hrtimers_init(void) #endif } +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && !expires->tv64) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "inifinte" + */ + if (!expires) { + schedule(); + __set_current_state(TASK_RUNNING); + return -EINTR; + } + + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); + t.timer.expires = *expires; + + hrtimer_init_sleeper(&t, current); + + hrtimer_start(&t.timer, t.timer.expires, mode); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); -- cgit v1.2.1 From df0cc0539b4127bd02f64de2c335b4af1fdb3845 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 31 Aug 2008 08:09:53 -0700 Subject: select: add a timespec_add_safe() function For the select() rework, it's important to be able to add timespec structures in an overflow-safe manner. This patch adds a timespec_add_safe() function for this which is similar in operation to ktime_add_safe(), but works on a struct timespec. Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven --- kernel/time.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 6a08660b4fac..d63a4336fad6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64); #endif EXPORT_SYMBOL(jiffies); + +/* + * Add two timespec values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0) + */ +struct timespec timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} -- cgit v1.2.1 From cc584b213f252bf698849cf4be2377cd3ec7501a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 15:02:30 -0700 Subject: hrtimer: convert kernel/* to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts kernel/* to these accessors. Signed-off-by: Arjan van de Ven --- kernel/futex.c | 7 +++---- kernel/hrtimer.c | 44 +++++++++++++++++++++++--------------------- kernel/posix-timers.c | 10 ++++------ kernel/rtmutex.c | 3 +-- kernel/sched.c | 7 +++---- kernel/time/ntp.c | 3 +-- kernel/time/tick-sched.c | 21 ++++++++++----------- kernel/time/timer_list.c | 4 ++-- 8 files changed, 47 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7d1136e97c14..4cd5b4319b04 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1299,10 +1299,9 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - t.timer.expires = *abs_time; + hrtimer_set_expires(&t.timer, *abs_time); - hrtimer_start(&t.timer, t.timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) t.task = NULL; @@ -1404,7 +1403,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = *time; + hrtimer_set_expires(&to->timer, *time); } q.pi_state = NULL; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 782137dc755f..ae307feec74c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) if (!base->first) continue; timer = rb_entry(base->first, struct hrtimer, node); - expires = ktime_sub(timer->expires, base->offset); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } @@ -539,10 +539,10 @@ static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; - ktime_t expires = ktime_sub(timer->expires, base->offset); + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; - WARN_ON_ONCE(timer->expires.tv64 < 0); + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * When the callback is running, we do not reprogram the clock event @@ -794,7 +794,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) u64 orun = 1; ktime_t delta; - delta = ktime_sub(now, timer->expires); + delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta.tv64 < 0) return 0; @@ -806,8 +806,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); - timer->expires = ktime_add_ns(timer->expires, incr * orun); - if (timer->expires.tv64 > now.tv64) + hrtimer_add_expires_ns(timer, incr * orun); + if (hrtimer_get_expires_tv64(timer) > now.tv64) return orun; /* * This (and the ktime_add() below) is the @@ -815,7 +815,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) */ orun++; } - timer->expires = ktime_add_safe(timer->expires, interval); + hrtimer_add_expires(timer, interval); return orun; } @@ -847,7 +847,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * We dont care about collisions. Nodes with * the same expiry time stay together. */ - if (timer->expires.tv64 < entry->expires.tv64) { + if (hrtimer_get_expires_tv64(timer) < + hrtimer_get_expires_tv64(entry)) { link = &(*link)->rb_left; } else { link = &(*link)->rb_right; @@ -982,7 +983,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) #endif } - timer->expires = tim; + hrtimer_set_expires(timer, tim); timer_stats_hrtimer_set_start_info(timer); @@ -1076,7 +1077,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) ktime_t rem; base = lock_hrtimer_base(timer, &flags); - rem = ktime_sub(timer->expires, base->get_time()); + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; @@ -1108,7 +1109,7 @@ ktime_t hrtimer_get_next_event(void) continue; timer = rb_entry(base->first, struct hrtimer, node); - delta.tv64 = timer->expires.tv64; + delta.tv64 = hrtimer_get_expires_tv64(timer); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) mindelta.tv64 = delta.tv64; @@ -1308,10 +1309,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) timer = rb_entry(node, struct hrtimer, node); - if (basenow.tv64 < timer->expires.tv64) { + if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) { ktime_t expires; - expires = ktime_sub(timer->expires, + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires.tv64 < expires_next.tv64) expires_next = expires; @@ -1414,7 +1415,8 @@ void hrtimer_run_queues(void) struct hrtimer *timer; timer = rb_entry(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= timer->expires.tv64) + if (base->softirq_time.tv64 <= + hrtimer_get_expires_tv64(timer)) break; if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { @@ -1462,7 +1464,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start(&t->timer, t->timer.expires, mode); + hrtimer_start_expires(&t->timer, mode); if (!hrtimer_active(&t->timer)) t->task = NULL; @@ -1484,7 +1486,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) struct timespec rmt; ktime_t rem; - rem = ktime_sub(timer->expires, timer->base->get_time()); + rem = hrtimer_expires_remaining(timer); if (rem.tv64 <= 0) return 0; rmt = ktime_to_timespec(rem); @@ -1503,7 +1505,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); - t.timer.expires.tv64 = restart->nanosleep.expires; + hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); if (do_nanosleep(&t, HRTIMER_MODE_ABS)) goto out; @@ -1530,7 +1532,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, int ret = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - t.timer.expires = timespec_to_ktime(*rqtp); + hrtimer_set_expires(&t.timer, timespec_to_ktime(*rqtp)); if (do_nanosleep(&t, mode)) goto out; @@ -1550,7 +1552,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.index = t.timer.base->index; restart->nanosleep.rmtp = rmtp; - restart->nanosleep.expires = t.timer.expires.tv64; + restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); ret = -ERESTART_RESTARTBLOCK; out: @@ -1724,11 +1726,11 @@ int __sched schedule_hrtimeout(ktime_t *expires, } hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); - t.timer.expires = *expires; + hrtimer_set_expires(&t.timer, *expires); hrtimer_init_sleeper(&t, current); - hrtimer_start(&t.timer, t.timer.expires, mode); + hrtimer_start_expires(&t.timer, mode); if (!hrtimer_active(&t.timer)) t.task = NULL; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbff..f85efcdcab2d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -668,7 +668,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(timer->expires, now); + remaining = ktime_sub(hrtimer_get_expires(timer), now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* @@ -762,7 +762,7 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; - timer->expires = timespec_to_ktime(new_setting->it_value); + hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); /* Convert interval */ timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); @@ -771,14 +771,12 @@ common_timer_set(struct k_itimer *timr, int flags, if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ if (mode == HRTIMER_MODE_REL) { - timer->expires = - ktime_add_safe(timer->expires, - timer->base->get_time()); + hrtimer_add_expires(timer, timer->base->get_time()); } return 0; } - hrtimer_start(timer, timer->expires, mode); + hrtimer_start_expires(timer, mode); return 0; } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 6522ae5b14a2..69d9cb921ffa 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* Setup the timer, when timeout != NULL */ if (unlikely(timeout)) { - hrtimer_start(&timeout->timer, timeout->timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&timeout->timer)) timeout->task = NULL; } diff --git a/kernel/sched.c b/kernel/sched.c index 1a5f73c1fcdc..e46b5afa200d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -221,9 +221,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start(&rt_b->rt_period_timer, - rt_b->rt_period_timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -1058,7 +1057,7 @@ static void hrtick_start(struct rq *rq, u64 delay) struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - timer->expires = time; + hrtimer_set_expires(timer, time); if (rq == this_rq()) { hrtimer_restart(timer); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd8196b..4c8d85421d24 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OOP; printk(KERN_NOTICE "Clock: " "inserting leap second 23:59:60 UTC\n"); - leap_timer.expires = ktime_add_ns(leap_timer.expires, - NSEC_PER_SEC); + hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); res = HRTIMER_RESTART; break; case TIME_DEL: diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b0468568b..b33be61c0f6b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -288,7 +288,7 @@ void tick_nohz_stop_sched_tick(int inidle) goto out; } - ts->idle_tick = ts->sched_timer.expires; + ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); @@ -419,21 +419,21 @@ void tick_nohz_restart_sched_tick(void) ts->tick_stopped = 0; ts->idle_exittime = now; hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; + hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); while (1) { /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, tick_period); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) break; } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) + if (!tick_program_event( + hrtimer_get_expires(&ts->sched_timer), 0)) break; } /* Update jiffies and reread time */ @@ -446,7 +446,7 @@ void tick_nohz_restart_sched_tick(void) static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(ts->sched_timer.expires, 0); + return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); } /* @@ -529,7 +529,7 @@ static void tick_nohz_switch_to_nohz(void) next = tick_init_jiffy_update(); for (;;) { - ts->sched_timer.expires = next; + hrtimer_set_expires(&ts->sched_timer, next); if (!tick_program_event(next, 0)) break; next = ktime_add(next, tick_period); @@ -625,16 +625,15 @@ void tick_setup_sched_timer(void) ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; /* Get the next period (per cpu) */ - ts->sched_timer.expires = tick_init_jiffy_update(); + hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); offset = ktime_to_ns(tick_period) >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); - ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); + hrtimer_add_expires_ns(&ts->sched_timer, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) break; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20fd0001..5224a3215fb7 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -66,8 +66,8 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) #endif SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", - (unsigned long long)ktime_to_ns(timer->expires), - (long long)(ktime_to_ns(timer->expires) - now)); + (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } static void -- cgit v1.2.1 From 654c8e0b1c623b156c5b92f28d914ab38c9c2c90 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 15:47:08 -0700 Subject: hrtimer: turn hrtimers into range timers this patch turns hrtimers into range timers; they have 2 expire points 1) the soft expire point 2) the hard expire point the kernel will do it's regular best effort attempt to get the timer run at the hard expire point. However, if some other time fires after the soft expire point, the kernel now has the freedom to fire this timer at this point, and thus grouping the events and preventing a power-expensive wakeup in the future. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae307feec74c..01483004183d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1309,7 +1309,20 @@ void hrtimer_interrupt(struct clock_event_device *dev) timer = rb_entry(node, struct hrtimer, node); - if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) { + /* + * The immediate goal for using the softexpires is + * minimizing wakeups, not running timers at the + * earliest interrupt after their soft expiration. + * This allows us to avoid using a Priority Search + * Tree, which can answer a stabbing querry for + * overlapping intervals and instead use the simple + * BST we already have. + * We don't add extra wakeups by delaying timers that + * are right-of a not yet expired timer, because that + * timer will have to trigger a wakeup anyway. + */ + + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), @@ -1681,14 +1694,20 @@ void __init hrtimers_init(void) } /** - * schedule_hrtimeout - sleep until timeout + * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to @@ -1702,7 +1721,7 @@ void __init hrtimers_init(void) * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout(ktime_t *expires, +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, const enum hrtimer_mode mode) { struct hrtimer_sleeper t; @@ -1726,7 +1745,7 @@ int __sched schedule_hrtimeout(ktime_t *expires, } hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); - hrtimer_set_expires(&t.timer, *expires); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1744,4 +1763,33 @@ int __sched schedule_hrtimeout(ktime_t *expires, return !t.task ? 0 : -EINTR; } +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} EXPORT_SYMBOL_GPL(schedule_hrtimeout); -- cgit v1.2.1 From 6976675d94042fbd446231d1bd8b7de71a980ada Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 15:52:40 -0700 Subject: hrtimer: create a "timer_slack" field in the task struct We want to be able to control the default "rounding" that is used by select() and poll() and friends. This is a per process property (so that we can have a "nice" like program to start certain programs with a looser or stricter rounding) that can be set/get via a prctl(). For this purpose, a field called "timer_slack_ns" is added to the task struct. In addition, a field called "default_timer_slack"ns" is added so that tasks easily can temporarily to a more/less accurate slack and then back to the default. The default value of the slack is set to 50 usec; this is significantly less than 2.6.27's average select() and poll() timing error but still allows the kernel to group timers somewhat to preserve power behavior. Applications and admins can override this via the prctl() Signed-off-by: Arjan van de Ven --- kernel/fork.c | 2 ++ kernel/sys.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe84796..4308d75f0fa5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -987,6 +987,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; + p->default_timer_slack_ns = current->timer_slack_ns; + #ifdef CONFIG_DETECT_SOFTLOCKUP p->last_switch_count = 0; p->last_switch_timestamp = 0; diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc0901d..1b96401a0576 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1727,6 +1727,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = + current->default_timer_slack_ns; + else + current->timer_slack_ns = arg2; + break; default: error = -EINVAL; break; -- cgit v1.2.1 From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Fri, 5 Sep 2008 23:46:19 +0200 Subject: sched: compilation fix with gcc 3.4.6 I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6. The error is: CC kernel/sched.o kernel/sched.c: In function `start_rt_bandwidth': kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available kernel/sched.c:214: sorry, unimplemented: called from here make[1]: *** [kernel/sched.o] Error 1 make: *** [kernel] Error 2 It seems that the gcc 3.4.6 requires full inline definition before first usage. The patch below fixes the compilation problem. Signed-off-by: Krzysztof Helt (if needed> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 703f56d5db5f..4de2bfb28c58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -static inline int rt_bandwidth_enabled(void); +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { @@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v1.2.1 From 38736f475071b80b66be28af7b44c854073699cc Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sat, 6 Sep 2008 14:50:23 +0530 Subject: sched: fix __load_balance_iterator() for cfq with only one task The __load_balance_iterator() returns a NULL when there's only one sched_entity which is a task. It is caused by the following code-path. /* Skip over entities that are not tasks */ do { se = list_entry(next, struct sched_entity, group_node); next = next->next; } while (next != &cfs_rq->tasks && !entity_is_task(se)); if (next == &cfs_rq->tasks) return NULL; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This will return NULL even when se is a task. As a side-effect, there was a regression in sched_mc behavior since 2.6.25, since iter_move_one_task() when it calls load_balance_start_fair(), would not get any tasks to move! Fix this by checking if the last entity was a task or not. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8264bb5dbd51..a10ac0bcee64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1458,7 +1458,7 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) next = next->next; } while (next != &cfs_rq->tasks && !entity_is_task(se)); - if (next == &cfs_rq->tasks) + if (next == &cfs_rq->tasks && !entity_is_task(se)) return NULL; cfs_rq->balance_iterator = next; -- cgit v1.2.1 From 3ba35573ad9a149a3af19625b502679283382f6b Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 31 Aug 2008 19:58:49 +0200 Subject: kernel/cpu.c: Move the CPU_DYING notifiers When a cpu is taken offline, the CPU_DYING notifiers are called on the dying cpu. According to , the cpu should be "not running any task, not handling interrupts, soon dead". For the current implementation, this is not true: - __cpu_disable can fail. If it fails, then the cpu will remain alive and happy. - At least on x86, __cpu_disable() briefly enables the local interrupts to handle any outstanding interrupts. What about moving CPU_DYING down a few lines, behind the __cpu_disable() line? There are only two CPU_DYING handlers in the kernel right now: one in kvm, one in the scheduler. Both should work with the patch applied [and: I'm not sure if either one handles a failing __cpu_disable()] The patch survives simple offlining a cpu. kvm untested due to lack of a test setup. Signed-off-By: Manfred Spraul Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..9e7ebde1331c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); -- cgit v1.2.1 From 978b0116cd225682a29e3d1d5010319bf2de32c2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 6 Sep 2008 20:04:36 +0200 Subject: softirq: allocate less vectors We don't need whole 32 of them, only NR_SOFTIRQS. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index c506f266a6b9..82e32aadedd8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -46,7 +46,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; EXPORT_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); -- cgit v1.2.1 From 7e6e178ab1548c8d894a77593e757acf4510b8ba Mon Sep 17 00:00:00 2001 From: Pawel MOLL Date: Mon, 1 Sep 2008 10:12:11 +0100 Subject: genirq: irq_chip->startup() usage in setup_irq and set_irq_chained handler This patch clarifies usage of irq_chip->startup() callback: 1. The "if (startup) startup(); else enabled();" code in setup_irq() is unnecessary, as startup() falls back to enabled() via default callbacks, set by irq_chip_set_defaults(). 2. When using set_irq_chained_handler() the startup() was never called, which is not good at all... Fixed. And again - when startup() is not defined the call will fall back to enable() than to unmask() via default callbacks. Signed-off-by: Pawel Moll Acked-by: Benjamin Herrenschmidt Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 2 +- kernel/irq/manage.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 964964baefa2..240c64d59267 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -587,7 +587,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->chip->unmask(irq); + desc->chip->startup(irq); } spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ae1b684e048c..9aa3e7b81389 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -397,10 +397,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - if (desc->chip->startup) - desc->chip->startup(irq); - else - desc->chip->enable(irq); + desc->chip->startup(irq); } else /* Undo nested disables: */ desc->depth = 1; -- cgit v1.2.1 From da8f2e170ea94cc20f8ebbc8ee8d127edb8f12f1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 7 Sep 2008 10:47:46 -0700 Subject: hrtimer: add a hrtimer_start_range() function this patch adds a _range version of hrtimer_start() so that range timers can be created; the hrtimer_start() function is just a wrapper around this. In addition, hrtimer_start_expires() will now preserve existing ranges. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 01483004183d..a0222097c57e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -945,9 +945,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) } /** - * hrtimer_start - (re)start an relative timer on the current CPU + * hrtimer_start_range_ns - (re)start an relative timer on the current CPU * @timer: the timer to be added * @tim: expiry time + * @delta_ns: "slack" range for the timer * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) * * Returns: @@ -955,7 +956,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * 1 when the timer was active */ int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, + const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -983,7 +985,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) #endif } - hrtimer_set_expires(timer, tim); + hrtimer_set_expires_range_ns(timer, tim, delta_ns); timer_stats_hrtimer_set_start_info(timer); @@ -1016,8 +1018,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) return ret; } +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +/** + * hrtimer_start - (re)start an relative timer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int +hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +{ + return hrtimer_start_range_ns(timer, tim, 0, mode); +} EXPORT_SYMBOL_GPL(hrtimer_start); + /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop -- cgit v1.2.1 From 704af52bd13a5d9f3c60c496c68e752fafdfb434 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 7 Sep 2008 16:10:20 -0700 Subject: hrtimer: show the timer ranges in /proc/timer_list to help debugging and visibility of timer ranges, show them in the existing timer list in /proc/timer_list Signed-off-by: Arjan van de Ven --- kernel/time/timer_list.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5224a3215fb7..122ee751d2d1 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,8 +65,10 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", + SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -- cgit v1.2.1 From e545a6140b698b2494daf0b32107bdcc5e901390 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Sep 2008 16:57:22 +0200 Subject: kernel/cpu.c: create a CPU_STARTING cpu_chain notifier Right now, there is no notifier that is called on a new cpu, before the new cpu begins processing interrupts/softirqs. Various kernel function would need that notification, e.g. kvm works around by calling smp_call_function_single(), rcu polls cpu_online_map. The patch adds a CPU_STARTING notification. It also adds a helper function that sends the message to all cpu_chain handlers. Tested on x86-64. All other archs are untested. Especially on sparc, I'm not sure if I got it right. Signed-off-by: Manfred Spraul Signed-off-by: Ingo Molnar --- kernel/cpu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..dc45f2459efb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -453,6 +453,25 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ /* -- cgit v1.2.1 From 3bd012060f962567aadb52b27b2fc8fdc91102c7 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 8 Sep 2008 08:58:59 -0700 Subject: hrtimer: make the nanosleep() syscall use the per process slack This patch makes the nanosleep() system call use the per process slack value; with this users are able to externally control existing applications to reduce the wakeup rate. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a0222097c57e..9a4c90185566 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1563,9 +1563,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; + unsigned long slack; + + slack = current->timer_slack_ns; + if (rt_task(current)) + slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires(&t.timer, timespec_to_ktime(*rqtp)); + hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); if (do_nanosleep(&t, mode)) goto out; -- cgit v1.2.1 From ae4b748e81b7e366f04f55229d5e372e372c33af Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 8 Sep 2008 09:03:57 -0700 Subject: hrtimer: make the futex() system call use the per process slack value This patch makes the futex() system call use the per process slack value; with this users are able to externally control existing applications to reduce the wakeup rate. Signed-off-by: Arjan van de Ven --- kernel/futex.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4cd5b4319b04..8af10027514b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1296,10 +1296,14 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (!abs_time) schedule(); else { + unsigned long slack; + slack = current->timer_slack_ns; + if (rt_task(current)) + slack = 0; hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - hrtimer_set_expires(&t.timer, *abs_time); + hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) -- cgit v1.2.1 From 2e94d1f71f7e4404d997e6fb4f1618aa147d76f9 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 10 Sep 2008 16:06:00 -0700 Subject: hrtimer: peek at the timer queue just before going idle As part of going idle, we already look at the time of the next timer event to determine which C-state to select etc. This patch adds functionality that causes the timers that are past their soft expire time, to fire at this time, before we calculate the next wakeup time. This functionality will thus avoid wakeups by running timers before going idle rather than specially waking up for it. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9a4c90185566..eb2cf984959f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1381,6 +1381,36 @@ void hrtimer_interrupt(struct clock_event_device *dev) raise_softirq(HRTIMER_SOFTIRQ); } +/** + * hrtimer_peek_ahead_timers -- run soft-expired timers now + * + * hrtimer_peek_ahead_timers will peek at the timer queue of + * the current cpu and check if there are any timers for which + * the soft expires time has passed. If any such timers exist, + * they are run immediately and then removed from the timer queue. + * + */ +void hrtimer_peek_ahead_timers(void) +{ + unsigned long flags; + struct tick_device *td; + struct clock_event_device *dev; + + if (hrtimer_hres_active()) + return; + + local_irq_save(flags); + td = &__get_cpu_var(tick_cpu_device); + if (!td) + goto out; + dev = td->evtdev; + if (!dev) + goto out; + hrtimer_interrupt(dev); +out: + local_irq_restore(flags); +} + static void run_hrtimer_softirq(struct softirq_action *h) { run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); -- cgit v1.2.1 From 72c57ed50663dc04b0b329beaec39b557c8ac5a5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 11 Sep 2008 23:29:54 -0700 Subject: sysctl: Use CONFIG_SPARC instead of __sparc__ for ifdef tests. Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fe4713347275..eda6162c58f2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -118,7 +118,7 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif -#ifdef __sparc__ +#ifdef CONFIG_SPARC extern char reboot_command []; extern int stop_a_enabled; extern int scons_pwroff; @@ -414,7 +414,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef __sparc__ +#ifdef CONFIG_SPARC { .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", -- cgit v1.2.1 From 17f04fbb0f7153d95ec33da81189b113cc778157 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 11 Sep 2008 23:33:53 -0700 Subject: sysctl: Use header file for sysctl knob declarations on sparc. This also takes care of a sparse warning as scons_pwroff's definition point. Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index eda6162c58f2..da5152088cc3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -119,9 +119,7 @@ extern int sg_big_buff; #endif #ifdef CONFIG_SPARC -extern char reboot_command []; -extern int stop_a_enabled; -extern int scons_pwroff; +#include #endif #ifdef __hppa__ -- cgit v1.2.1 From f06febc96ba8e0af80bcc3eaec0a109e88275fac Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar Cc: Roland McGrath Cc: Alexey Dobriyan Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/compat.c | 53 ++---- kernel/exit.c | 19 +- kernel/fork.c | 88 +++++---- kernel/itimer.c | 33 +--- kernel/posix-cpu-timers.c | 471 +++++++++++++++++++++++++--------------------- kernel/sched.c | 53 +++++- kernel/sched_fair.c | 1 + kernel/sched_rt.c | 4 +- kernel/signal.c | 8 +- kernel/sys.c | 75 +++----- 10 files changed, 415 insertions(+), 390 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a8ab9a..72650e39b3e6 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -150,49 +151,23 @@ asmlinkage long compat_sys_setitimer(int which, return 0; } +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { + struct tms tms; struct compat_tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --git a/kernel/exit.c b/kernel/exit.c index 16395644a98f..40036ac04271 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -112,8 +112,6 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1294,6 +1291,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@ -1309,20 +1307,23 @@ static int wait_task_zombie(struct task_struct *p, int options, * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; + thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(cputime.utime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(cputime.stime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe84796..a8ac2efb8e30 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -759,15 +759,44 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ + /* Thread group counters. */ + thread_group_cputime_init(sig); + + /* Expiration times and increments. */ + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + /* Cached expiration times. */ + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; + + /* The timer lists. */ + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); +} + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - return 0; + ret = thread_group_cputime_clone_thread(current, tsk); + if (likely(!ret)) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + } + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -795,15 +824,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; @@ -820,14 +844,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * New sole thread in the process gets an expiry time - * of the whole CPU time limit. - */ - tsk->it_prof_expires = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - } + posix_cpu_timers_init_group(sig); + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -837,6 +855,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { + thread_group_cputime_free(sig); exit_thread_group_keys(sig); kmem_cache_free(signal_cachep, sig); } @@ -885,6 +904,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) } #endif /* CONFIG_MM_OWNER */ +/* + * Initialize POSIX timer handling for a single task. + */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.sched_exp = 0; + INIT_LIST_HEAD(&tsk->cpu_timers[0]); + INIT_LIST_HEAD(&tsk->cpu_timers[1]); + INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -995,12 +1027,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@ -1201,21 +1228,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - - if (!cputime_eq(current->signal->it_virt_expires, - cputime_zero) || - !cputime_eq(current->signal->it_prof_expires, - cputime_zero) || - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || - !list_empty(¤t->signal->cpu_timers[0]) || - !list_empty(¤t->signal->cpu_timers[1]) || - !list_empty(¤t->signal->cpu_timers[2])) { - /* - * Have child wake up on its first tick to check - * for process CPU timers. - */ - p->it_prof_expires = jiffies_to_cputime(1); - } } if (likely(p->pid)) { diff --git a/kernel/itimer.c b/kernel/itimer.c index ab982747d9bd..db7c358b9a02 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t utime = tsk->signal->utime; - do { - utime = cputime_add(utime, t->utime); - t = next_thread(t); - } while (t != tsk); + struct task_cputime cputime; + cputime_t utime; + + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; case ITIMER_PROF: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t ptime = cputime_add(tsk->signal->utime, - tsk->signal->stime); - do { - ptime = cputime_add(ptime, - cputime_add(t->utime, - t->stime)); - t = next_thread(t); - } while (t != tsk); + struct task_cputime times; + cputime_t ptime; + + thread_group_cputime(tsk, ×); + ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; @@ -185,7 +176,6 @@ again: case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; @@ -200,7 +190,6 @@ again: tsk->signal->it_virt_expires = nval; tsk->signal->it_virt_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); @@ -209,7 +198,6 @@ again: case ITIMER_PROF: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; @@ -224,7 +212,6 @@ again: tsk->signal->it_prof_expires = nval; tsk->signal->it_prof_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c42a03aef36f..dba1c334c3e8 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,6 +8,99 @@ #include #include +#ifdef CONFIG_SMP +/* + * Allocate the thread_group_cputime structure appropriately for SMP kernels + * and fill in the current values of the fields. Called from copy_signal() + * via thread_group_cputime_clone_thread() when adding a second or subsequent + * thread to a thread group. Assumes interrupts are enabled when called. + */ +int thread_group_cputime_alloc_smp(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct task_cputime *cputime; + + /* + * If we have multiple threads and we don't already have a + * per-CPU task_cputime struct, allocate one and fill it in with + * the times accumulated so far. + */ + if (sig->cputime.totals) + return 0; + cputime = alloc_percpu(struct task_cputime); + if (cputime == NULL) + return -ENOMEM; + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + if (sig->cputime.totals) { + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + free_percpu(cputime); + return 0; + } + sig->cputime.totals = cputime; + cputime = per_cpu_ptr(sig->cputime.totals, get_cpu()); + cputime->utime = tsk->utime; + cputime->stime = tsk->stime; + cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; + put_cpu_no_resched(); + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + return 0; +} + +/** + * thread_group_cputime_smp - Sum the thread group time fields across all CPUs. + * + * @tsk: The task we use to identify the thread group. + * @times: task_cputime structure in which we return the summed fields. + * + * Walk the list of CPUs to sum the per-CPU time fields in the thread group + * time structure. + */ +void thread_group_cputime_smp( + struct task_struct *tsk, + struct task_cputime *times) +{ + struct signal_struct *sig; + int i; + struct task_cputime *tot; + + sig = tsk->signal; + if (unlikely(!sig) || !sig->cputime.totals) { + times->utime = tsk->utime; + times->stime = tsk->stime; + times->sum_exec_runtime = tsk->se.sum_exec_runtime; + return; + } + times->stime = times->utime = cputime_zero; + times->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + times->utime = cputime_add(times->utime, tot->utime); + times->stime = cputime_add(times->stime, tot->stime); + times->sum_exec_runtime += tot->sum_exec_runtime; + } +} + +#endif /* CONFIG_SMP */ + +/* + * Called after updating RLIMIT_CPU to set timer expiration if necessary. + */ +void update_rlimit_cpu(unsigned long rlim_new) +{ + cputime_t cputime; + + cputime = secs_to_cputime(rlim_new); + if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || + cputime_lt(current->signal->it_prof_expires, cputime)) { + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); + } +} + static int check_clock(const clockid_t which_clock) { int error = 0; @@ -158,10 +251,6 @@ static inline cputime_t virt_ticks(struct task_struct *p) { return p->utime; } -static inline unsigned long long sched_ns(struct task_struct *p) -{ - return task_sched_runtime(p); -} int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -211,7 +300,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = sched_ns(p); + cpu->sched = task_sched_runtime(p); break; } return 0; @@ -226,31 +315,20 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, struct task_struct *p, union cpu_time_count *cpu) { - struct task_struct *t = p; - switch (clock_idx) { + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + switch (clock_idx) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); - do { - cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = p->signal->utime; - do { - cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; - /* Add in each other live thread. */ - while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; - } - cpu->sched += sched_ns(p); + cpu->sched = thread_group_sched_runtime(p); break; } return 0; @@ -471,80 +549,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, tsk->signal->utime), - cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); -} - - -/* - * Set the expiry times of all the threads in the process so one of them - * will go off before the process cumulative expiry total is reached. - */ -static void process_timer_rebalance(struct task_struct *p, - unsigned int clock_idx, - union cpu_time_count expires, - union cpu_time_count val) -{ - cputime_t ticks, left; - unsigned long long ns, nsleft; - struct task_struct *t = p; - unsigned int nthreads = atomic_read(&p->signal->live); - - if (!nthreads) - return; + struct task_cputime cputime; - switch (clock_idx) { - default: - BUG(); - break; - case CPUCLOCK_PROF: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(prof_ticks(t), left); - if (cputime_eq(t->it_prof_expires, - cputime_zero) || - cputime_gt(t->it_prof_expires, ticks)) { - t->it_prof_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_VIRT: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(virt_ticks(t), left); - if (cputime_eq(t->it_virt_expires, - cputime_zero) || - cputime_gt(t->it_virt_expires, ticks)) { - t->it_virt_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_SCHED: - nsleft = expires.sched - val.sched; - do_div(nsleft, nthreads); - nsleft = max_t(unsigned long long, nsleft, 1); - do { - if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; - if (t->it_sched_expires == 0 || - t->it_sched_expires > ns) { - t->it_sched_expires = ns; - } - } - t = next_thread(t); - } while (t != p); - break; - } + thread_group_cputime(tsk, &cputime); + cleanup_timers(tsk->signal->cpu_timers, + cputime.utime, cputime.stime, cputime.sum_exec_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -608,29 +617,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->it_prof_expires, + if (cputime_eq(p->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(p->it_prof_expires, + cputime_gt(p->cputime_expires.prof_exp, nt->expires.cpu)) - p->it_prof_expires = nt->expires.cpu; + p->cputime_expires.prof_exp = + nt->expires.cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->it_virt_expires, + if (cputime_eq(p->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(p->it_virt_expires, + cputime_gt(p->cputime_expires.virt_exp, nt->expires.cpu)) - p->it_virt_expires = nt->expires.cpu; + p->cputime_expires.virt_exp = + nt->expires.cpu; break; case CPUCLOCK_SCHED: - if (p->it_sched_expires == 0 || - p->it_sched_expires > nt->expires.sched) - p->it_sched_expires = nt->expires.sched; + if (p->cputime_expires.sched_exp == 0 || + p->cputime_expires.sched_exp > + nt->expires.sched) + p->cputime_expires.sched_exp = + nt->expires.sched; break; } } else { /* - * For a process timer, we must balance - * all the live threads' expirations. + * For a process timer, set the cached expiration time. */ switch (CPUCLOCK_WHICH(timer->it_clock)) { default: @@ -641,7 +653,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) cputime_lt(p->signal->it_virt_expires, timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.virt_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_PROF: if (!cputime_eq(p->signal->it_prof_expires, cputime_zero) && @@ -652,13 +666,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) if (i != RLIM_INFINITY && i <= cputime_to_secs(timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.prof_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_SCHED: - rebalance: - process_timer_rebalance( - timer->it.cpu.task, - CPUCLOCK_WHICH(timer->it_clock), - timer->it.cpu.expires, now); + p->signal->cputime_expires.sched_exp = + timer->it.cpu.expires.sched; break; } } @@ -969,13 +982,13 @@ static void check_thread_timers(struct task_struct *tsk, struct signal_struct *const sig = tsk->signal; maxfire = 20; - tsk->it_prof_expires = cputime_zero; + tsk->cputime_expires.prof_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { - tsk->it_prof_expires = t->expires.cpu; + tsk->cputime_expires.prof_exp = t->expires.cpu; break; } t->firing = 1; @@ -984,13 +997,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_virt_expires = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { - tsk->it_virt_expires = t->expires.cpu; + tsk->cputime_expires.virt_exp = t->expires.cpu; break; } t->firing = 1; @@ -999,13 +1012,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_sched_expires = 0; + tsk->cputime_expires.sched_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->it_sched_expires = t->expires.sched; + tsk->cputime_expires.sched_exp = t->expires.sched; break; } t->firing = 1; @@ -1055,10 +1068,10 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime, ptime, virt_expires, prof_expires; + cputime_t utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; - struct task_struct *t; struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; /* * Don't sample the current process CPU clocks if there are no timers. @@ -1074,18 +1087,10 @@ static void check_process_timers(struct task_struct *tsk, /* * Collect the current process totals. */ - utime = sig->utime; - stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; - t = next_thread(t); - } while (t != tsk); - ptime = cputime_add(utime, stime); - + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; + ptime = cputime_add(utime, cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { @@ -1193,60 +1198,18 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) || - !cputime_eq(virt_expires, cputime_zero) || - sched_expires != 0) { - /* - * Rebalance the threads' expiry times for the remaining - * process CPU timers. - */ - - cputime_t prof_left, virt_left, ticks; - unsigned long long sched_left, sched; - const unsigned int nthreads = atomic_read(&sig->live); - - if (!nthreads) - return; - - prof_left = cputime_sub(prof_expires, utime); - prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div_non_zero(prof_left, nthreads); - virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div_non_zero(virt_left, nthreads); - if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; - do_div(sched_left, nthreads); - sched_left = max_t(unsigned long long, sched_left, 1); - } else { - sched_left = 0; - } - t = tsk; - do { - if (unlikely(t->flags & PF_EXITING)) - continue; - - ticks = cputime_add(cputime_add(t->utime, t->stime), - prof_left); - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(t->it_prof_expires, cputime_zero) || - cputime_gt(t->it_prof_expires, ticks))) { - t->it_prof_expires = ticks; - } - - ticks = cputime_add(t->utime, virt_left); - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(t->it_virt_expires, cputime_zero) || - cputime_gt(t->it_virt_expires, ticks))) { - t->it_virt_expires = ticks; - } - - sched = t->se.sum_exec_runtime + sched_left; - if (sched_expires && (t->it_sched_expires == 0 || - t->it_sched_expires > sched)) { - t->it_sched_expires = sched; - } - } while ((t = next_thread(t)) != tsk); - } + if (!cputime_eq(prof_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) + sig->cputime_expires.prof_exp = prof_expires; + if (!cputime_eq(virt_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) + sig->cputime_expires.virt_exp = virt_expires; + if (sched_expires != 0 && + (sig->cputime_expires.sched_exp == 0 || + sig->cputime_expires.sched_exp > sched_expires)) + sig->cputime_expires.sched_exp = sched_expires; } /* @@ -1314,6 +1277,78 @@ out: ++timer->it_requeue_pending; } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (!cputime_eq(expires->utime, cputime_zero) && + cputime_ge(sample->utime, expires->utime)) + return 1; + if (!cputime_eq(expires->stime, cputime_zero) && + cputime_ge(cputime_add(sample->utime, sample->stime), + expires->stime)) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * @sig: The signal pointer for that task. + * + * If there are no timers set return false. Otherwise snapshot the task and + * thread group timers, then compare them with the corresponding expiration + # times. Returns true if a timer has expired, else returns false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk, + struct signal_struct *sig) +{ + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + struct task_cputime group_sample; + + if (task_cputime_zero(&tsk->cputime_expires) && + task_cputime_zero(&sig->cputime_expires)) + return 0; + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + thread_group_cputime(tsk, &group_sample); + return task_cputime_expired(&group_sample, &sig->cputime_expires); +} + /* * This is called from the timer interrupt handler. The irq handler has * already updated our counts. We need to check if any timers fire now. @@ -1323,30 +1358,29 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + struct signal_struct *sig; + struct sighand_struct *sighand; + unsigned long flags; BUG_ON(!irqs_disabled()); -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) - return; - -#undef UNEXPIRED - + /* Pick up tsk->signal and make sure it's valid. */ + sig = tsk->signal; /* - * Double-check with locks held. + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. Also check that + * tsk->signal is non-NULL; this probably can't happen but cover the + * possibility anyway. */ - read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); - + if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) { + return; + } + sighand = lock_task_sighand(tsk, &flags); + if (likely(sighand)) { /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. */ check_thread_timers(tsk, &firing); check_process_timers(tsk, &firing); @@ -1359,9 +1393,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, @@ -1389,10 +1422,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Set one of the process-wide special case CPU timers. - * The tasklist_lock and tsk->sighand->siglock must be held by the caller. - * The oldval argument is null for the RLIMIT_CPU timer, where *newval is - * absolute; non-null for ITIMER_*, where *newval is relative and we update - * it to be absolute, *oldval is absolute and we update it to be relative. + * The tsk->sighand->siglock must be held by the caller. + * The *newval argument is relative and we update it to be absolute, *oldval + * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) @@ -1435,13 +1467,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { - /* - * Rejigger each thread's expiry time so that one will - * notice before we hit the process-cumulative expiry time. - */ - union cpu_time_count expires = { .sched = 0 }; - expires.cpu = *newval; - process_timer_rebalance(tsk, clock_idx, expires, now); + switch (clock_idx) { + case CPUCLOCK_PROF: + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } } } diff --git a/kernel/sched.c b/kernel/sched.c index cc1f81b50b82..c51b5d276665 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4036,6 +4036,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. + */ +static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq) +{ + if (task_current(rq, p)) { + u64 delta_exec; + + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; + if ((s64)delta_exec > 0) + return delta_exec; + } + return 0; +} + /* * Return p->sum_exec_runtime plus any more ns on the sched_clock * that have not yet been banked in case the task is currently running. @@ -4043,17 +4062,31 @@ EXPORT_PER_CPU_SYMBOL(kstat); unsigned long long task_sched_runtime(struct task_struct *p) { unsigned long flags; - u64 ns, delta_exec; + u64 ns; struct rq *rq; rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; - if (task_current(rq, p)) { - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns += delta_exec; - } + ns = p->se.sum_exec_runtime + task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); + + return ns; +} + +/* + * Return sum_exec_runtime for the thread group plus any more ns on the + * sched_clock that have not yet been banked in case the task is currently + * running. + */ +unsigned long long thread_group_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + u64 ns; + struct rq *rq; + struct task_cputime totals; + + rq = task_rq_lock(p, &flags); + thread_group_cputime(p, &totals); + ns = totals.sum_exec_runtime + task_delta_exec(p, rq); task_rq_unlock(rq, &flags); return ns; @@ -4070,6 +4103,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4094,6 +4128,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4129,6 +4164,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } p->stime = cputime_add(p->stime, cputime); + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4170,6 +4206,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c6d4bb..99aa31acc544 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -507,6 +507,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); } } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 552310798dad..8375e69af36a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -483,6 +483,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); @@ -1412,7 +1414,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) - p->it_sched_expires = p->se.sum_exec_runtime; + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } diff --git a/kernel/signal.c b/kernel/signal.c index e661b01d340f..6eea5826d618 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; + struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1368,10 +1369,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); + thread_group_cputime(tsk, &cputime); + info.si_utime = cputime_to_jiffies(cputime.utime); + info.si_stime = cputime_to_jiffies(cputime.stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc0901d..d046a7a055c2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +void do_sys_times(struct tms *tms) +{ + struct task_cputime cputime; + cputime_t cutime, cstime; + + spin_lock_irq(¤t->sighand->siglock); + thread_group_cputime(current, &cputime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(cputime.utime); + tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + asmlinkage long sys_times(struct tms __user * tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - spin_lock_irq(&tsk->sighand->siglock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - - tmp.tms_utime = cputime_to_clock_t(utime); - tmp.tms_stime = cputime_to_clock_t(stime); - tmp.tms_cutime = cputime_to_clock_t(cutime); - tmp.tms_cstime = cputime_to_clock_t(cstime); + + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@ -1445,7 +1435,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; - unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) @@ -1491,18 +1480,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { - unsigned long rlim_cur = new_rlim.rlim_cur; - cputime_t cputime; - - cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); - } + update_rlimit_cpu(new_rlim.rlim_cur); out: return 0; } @@ -1540,11 +1518,8 @@ out: * */ -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, - cputime_t *utimep, cputime_t *stimep) +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { - *utimep = cputime_add(*utimep, t->utime); - *stimep = cputime_add(*stimep, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@ -1558,12 +1533,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + struct task_cputime cputime; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - accumulate_thread_rusage(p, r, &utime, &stime); + accumulate_thread_rusage(p, r); goto out; } @@ -1586,8 +1562,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - utime = cputime_add(utime, p->signal->utime); - stime = cputime_add(stime, p->signal->stime); + thread_group_cputime(p, &cputime); + utime = cputime_add(utime, cputime.utime); + stime = cputime_add(stime, cputime.stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1596,7 +1573,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += p->signal->oublock; t = p; do { - accumulate_thread_rusage(t, r, &utime, &stime); + accumulate_thread_rusage(t, r); t = next_thread(t); } while (t != p); break; -- cgit v1.2.1 From 430b5294bd72c085c730e1e4b86580f164d976bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 16:33:01 +0200 Subject: timers: fix itimer/many thread hang, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: kernel/fork.c:843: error: ‘struct signal_struct’ has no member named ‘sum_sched_runtime’ kernel/irq/handle.c:117: warning: ‘sparse_irq_lock’ defined but not used Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8ac2efb8e30..1181b9aac48e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -834,7 +834,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); -- cgit v1.2.1 From 5ce73a4a5a4893a1aa4cdeed1b1a5a6de42c43b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 17:11:46 +0200 Subject: timers: fix itimer/many thread hang, cleanups Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index dba1c334c3e8..9a7ea049fcdc 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -94,7 +94,7 @@ void update_rlimit_cpu(unsigned long rlim_new) cputime = secs_to_cputime(rlim_new); if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_lt(current->signal->it_prof_expires, cputime)) { + cputime_lt(current->signal->it_prof_expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -1372,9 +1372,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) * tsk->signal is non-NULL; this probably can't happen but cover the * possibility anyway. */ - if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) { + if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) return; - } + sighand = lock_task_sighand(tsk, &flags); if (likely(sighand)) { /* -- cgit v1.2.1 From d7cfb60c5cf904ecf1e0ae23ec178175b86f0d4a Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 19 Sep 2008 13:13:44 +0100 Subject: hrtimer: remove hrtimer_clock_base::get_softirq_time() Peter Zijlstra noticed this 8 months ago and I just noticed it again. hrtimer_clock_base::get_softirq_time() is currently unused in the entire tree. In fact, looking at the logs, it appears as if it was never used. Remove it. Signed-off-by: Mark McLoughlin Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 03ea1378c43b..4d761d50c529 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1401,9 +1401,7 @@ void hrtimer_run_queues(void) if (!base->first) continue; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - else if (gettime) { + if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } -- cgit v1.2.1 From 15afe09bf496ae10c989e1a375a6b5da7bd3e16e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 20 Sep 2008 23:38:02 +0200 Subject: sched: wakeup preempt when small overlap Lin Ming reported a 10% OLTP regression against 2.6.27-rc4. The difference seems to come from different preemption agressiveness, which affects the cache footprint of the workload and its effective cache trashing. Aggresively preempt a task if its avg overlap is very small, this should avoid the task going to sleep and find it still running when we schedule back to it - saving a wakeup. Reported-by: Lin Ming Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++------ kernel/sched_fair.c | 13 ++++++++++--- kernel/sched_features.h | 1 + kernel/sched_idletask.c | 6 +++--- kernel/sched_rt.c | 2 +- 5 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0d8905a1b8ca..ad9d39b021f8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -604,9 +604,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -2282,7 +2282,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2417,7 +2417,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2877,7 +2877,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -6007,7 +6007,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } done: ret = 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a10ac0bcee64..7328383690f1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ static inline int depth_se(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -1367,6 +1367,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; + if (sched_feat(WAKEUP_OVERLAP) && sync && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) { + resched_task(curr); + return; + } + /* * preemption test can be made between sibling entities who are in the * same cfs_rq i.e who have a common parent. Walk up the hierarchy of @@ -1649,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* @@ -1666,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca78154e..bf027a7accf8 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe66..dec4ccabe2f5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 552310798dad..6d2d0a5d030b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -783,7 +783,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); -- cgit v1.2.1 From f681bbd656b01439be904250a1581ca9c27505a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Sep 2008 16:29:00 +0200 Subject: sched: turn off WAKEUP_OVERLAP WAKEUP_OVERLAP is not a winner on a 16way box, running psql+sysbench: .27-rc7-NO_WAKEUP_OVERLAP .27-rc7-WAKEUP_OVERLAP ------------------------------------------------- 1: 694 811 +14.39% 2: 1454 1427 -1.86% 4: 3017 3070 +1.70% 8: 5694 5808 +1.96% 16: 10592 10612 +0.19% 32: 9693 9647 -0.48% 64: 8507 8262 -2.97% 128: 8402 7087 -18.55% 256: 8419 5124 -64.30% 512: 7990 3671 -117.62% ------------------------------------------------- SUM: 64466 55524 -16.11% ... so turn it off by default. Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index bf027a7accf8..7c9e8f4a049f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,4 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 0) -- cgit v1.2.1 From caea8a03702c147e8ae90da0801e7ba8297b1d46 Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Mon, 22 Sep 2008 11:06:09 -0600 Subject: sched: fix list traversal to use _rcu variant load_balance_fair() calls rcu_read_lock() but then traverses the list using the regular list traversal routine. This patch converts the list traversal to use the _rcu version. Signed-off-by: Chris Friesen Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7328383690f1..3b89aa6594a9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1521,7 +1521,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry(tg, &task_groups, list) { + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; -- cgit v1.2.1 From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 22 Sep 2008 14:55:45 -0700 Subject: sched: fix init_hrtick() section mismatch warning LD kernel/built-in.o WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference from the function init_hrtick() to the variable .cpuinit.data:hotplug_hrtick_nb.8 The function init_hrtick() references the variable __cpuinitdata hotplug_hrtick_nb.8. This is often because init_hrtick lacks a __cpuinitdata annotation or the annotation of hotplug_hrtick_nb.8 is wrong. Signed-off-by: Md.Rakib H. Mullick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98890807375b..13dd2db9fb2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -- cgit v1.2.1 From 006c75f146e58e080d2b2725a6664f71886e112b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 22 Sep 2008 14:55:46 -0700 Subject: sched: clarify ifdef tangle - Add some comments to try to make the ifdef puzzle a bit clearer - Explicitly inline one of the three init_hrtick() implementations. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ad9d39b021f8..927c9307cd00 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1102,7 +1102,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1121,7 +1121,7 @@ static void init_rq_hrtick(struct rq *rq) rq->hrtick_timer.function = hrtick; rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1133,7 +1133,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. -- cgit v1.2.1 From 3a72dc8eb5a7122fff439a22bd22486a4fff505c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 22 Sep 2008 14:55:46 -0700 Subject: rcu: fix sparse shadowed variable warning kernel/rcuclassic.c:564:18: warning: symbol 'flags' shadows an earlier one kernel/rcuclassic.c:527:16: originally declared here Signed-off-by: Harvey Harrison Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 743cf0550ff4..ed15128ca2c9 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -561,15 +561,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, local_irq_restore(flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags; + unsigned long flags2; /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags); + spin_lock_irqsave(&rcp->lock, flags2); if (rcu_batch_after(rdp->batch, rcp->pending)) { rcp->pending = rdp->batch; rcu_start_batch(rcp); } - spin_unlock_irqrestore(&rcp->lock, flags); + spin_unlock_irqrestore(&rcp->lock, flags2); } } -- cgit v1.2.1 From 6441402b1f173fa38e561d3cee7c01c32e5281ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:46:37 +0200 Subject: clockevents: prevent cpu online to interfere with nohz Impact: rare hang which can be triggered on CPU online. tick_do_timer_cpu keeps track of the CPU which updates jiffies via do_timer. The value -1 is used to signal, that currently no CPU is doing this. There are two cases, where the variable can have this state: boot: necessary for systems where the boot cpu id can be != 0 nohz long idle sleep: When the CPU which did the jiffies update last goes into a long idle sleep it drops the update jiffies duty so another CPU which is not idle can pick it up and keep jiffies going. Using the same value for both situations is wrong, as the CPU online code can see the -1 state when the timer of the newly onlined CPU is setup. The setup for a newly onlined CPU goes through periodic mode and can pick up the do_timer duty without being aware of the nohz / highres mode of the already running system. Use two separate states and make them constants to avoid magic numbers confusion. Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 7 ++++--- kernel/time/tick-internal.h | 4 ++++ kernel/time/tick-sched.c | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 019315ebf9de..b523d095decf 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = -1; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); /* @@ -148,7 +148,7 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == -1) { + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); @@ -300,7 +300,8 @@ static void tick_shutdown(unsigned int *cpup) if (*cpup == tick_do_timer_cpu) { int cpu = first_cpu(cpu_online_map); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6e9db9734aa6..e18014fadf95 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b0468568b..31a14e8caac1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -221,7 +221,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) @@ -303,7 +303,7 @@ void tick_nohz_stop_sched_tick(int inidle) * invoked. */ if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->idle_sleeps++; @@ -468,7 +468,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; /* Check, if the jiffies need an update */ @@ -570,7 +570,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; #endif -- cgit v1.2.1 From 49d670fb8dd62d3ed4e3ed2513538ea65b051aed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:56:01 +0200 Subject: clockevents: prevent stale tick_next_period for onlining CPUs Impact: possible hang on CPU onlining in timer one shot mode. The tick_next_period variable is only used during boot on nohz/highres enabled systems, but for CPU onlining it needs to be maintained when the per cpu clock events device operates in one shot mode. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 31a14e8caac1..39019b3f7621 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now) incr * ticks); } do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&xtime_lock); } -- cgit v1.2.1 From 302745699c1b675b5d2a1af87271de10e4d96b6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:02:25 +0200 Subject: clockevents: check broadcast device not tick device Impact: Possible hang on CPU online observed on AMD C1E machines. The broadcast setup code looks at the mode of the tick device to determine whether it needs to be shut down or setup. This is wrong when the broadcast mode is set to one shot already. This can happen when a CPU is brought online as it goes through the periodic setup first. The problem went unnoticed as sane systems do not call into that code before the switch to one shot for the clock event device happens. The AMD C1E idle routine switches over immediately and thereby shuts down the just setup device before the first interrupt happens. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f1f3eee28113..e2b66d1c8ca5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,7 +235,7 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) @@ -245,7 +245,7 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; -- cgit v1.2.1 From 27ce4cb4a0c7cf59b9a9952266883862f2e4c99f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:04:02 +0200 Subject: clockevents: prevent mode mismatch on cpu online Impact: timer hang on CPU online observed on AMD C1E systems When a CPU is brought online then the broadcast machinery can be in the one shot state already. Check this and setup the timer device of the new CPU in one shot mode so the broadcast code can pick up the next_event value correctly. Another AMD C1E oddity, as we switch to broadcast immediately and not after the full bring up via the ACPI cpu idle code. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 8 ++++++++ kernel/time/tick-common.c | 3 ++- kernel/time/tick-internal.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e2b66d1c8ca5..bd7034542399 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,4 +575,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b523d095decf..df12434b43ca 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if (!tick_device_is_functional(dev)) return; - if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); } else { unsigned long seq; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e18014fadf95..55c3f4be6077 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -35,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -43,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } # endif /* !BROADCAST */ #else /* !ONESHOT */ -- cgit v1.2.1 From f8e256c687eb53850685747757c8d75e58756e15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Sep 2008 13:00:57 +0200 Subject: timers: fix build error in !oneshot case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/time/tick-common.c: In function ‘tick_setup_periodic’: kernel/time/tick-common.c:113: error: implicit declaration of function ‘tick_broadcast_oneshot_active’ Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 55c3f4be6077..469248782c23 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -74,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } #endif /* !TICK_ONESHOT */ /* -- cgit v1.2.1 From bb34d92f643086d546b49cef680f6f305ed84414 Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang, v2 This is the second resubmission of the posix timer rework patch, posted a few days ago. This includes the changes from the previous resubmittion, which addressed Oleg Nesterov's comments, removing the RCU stuff from the patch and un-inlining the thread_group_cputime() function for SMP. In addition, per Ingo Molnar it simplifies the UP code, consolidating much of it with the SMP version and depending on lower-level SMP/UP handling to take care of the differences. It also cleans up some UP compile errors, moves the scheduler stats-related macros into kernel/sched_stats.h, cleans up a merge error in kernel/fork.c and has a few other minor fixes and cleanups as suggested by Oleg and Ingo. Thanks for the review, guys. Signed-off-by: Frank Mayhar Cc: Roland McGrath Cc: Alexey Dobriyan Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/fork.c | 5 +- kernel/posix-cpu-timers.c | 153 +++++++++++++++++++--------------------------- kernel/sched.c | 47 +++----------- kernel/sched_stats.h | 136 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 208 insertions(+), 133 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1181b9aac48e..021ae012cc75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -791,7 +791,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) int ret; if (clone_flags & CLONE_THREAD) { - ret = thread_group_cputime_clone_thread(current, tsk); + ret = thread_group_cputime_clone_thread(current); if (likely(!ret)) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); @@ -834,9 +834,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); taskstats_tgid_init(sig); task_lock(current->group_leader); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9a7ea049fcdc..153dcb2639c3 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,50 +7,46 @@ #include #include #include +#include -#ifdef CONFIG_SMP /* - * Allocate the thread_group_cputime structure appropriately for SMP kernels - * and fill in the current values of the fields. Called from copy_signal() - * via thread_group_cputime_clone_thread() when adding a second or subsequent + * Allocate the thread_group_cputime structure appropriately and fill in the + * current values of the fields. Called from copy_signal() via + * thread_group_cputime_clone_thread() when adding a second or subsequent * thread to a thread group. Assumes interrupts are enabled when called. */ -int thread_group_cputime_alloc_smp(struct task_struct *tsk) +int thread_group_cputime_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct task_cputime *cputime; /* * If we have multiple threads and we don't already have a - * per-CPU task_cputime struct, allocate one and fill it in with - * the times accumulated so far. + * per-CPU task_cputime struct (checked in the caller), allocate + * one and fill it in with the times accumulated so far. We may + * race with another thread so recheck after we pick up the sighand + * lock. */ - if (sig->cputime.totals) - return 0; cputime = alloc_percpu(struct task_cputime); if (cputime == NULL) return -ENOMEM; - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); if (sig->cputime.totals) { spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); free_percpu(cputime); return 0; } sig->cputime.totals = cputime; - cputime = per_cpu_ptr(sig->cputime.totals, get_cpu()); + cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); cputime->utime = tsk->utime; cputime->stime = tsk->stime; cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; - put_cpu_no_resched(); spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); return 0; } /** - * thread_group_cputime_smp - Sum the thread group time fields across all CPUs. + * thread_group_cputime - Sum the thread group time fields across all CPUs. * * @tsk: The task we use to identify the thread group. * @times: task_cputime structure in which we return the summed fields. @@ -58,7 +54,7 @@ int thread_group_cputime_alloc_smp(struct task_struct *tsk) * Walk the list of CPUs to sum the per-CPU time fields in the thread group * time structure. */ -void thread_group_cputime_smp( +void thread_group_cputime( struct task_struct *tsk, struct task_cputime *times) { @@ -83,8 +79,6 @@ void thread_group_cputime_smp( } } -#endif /* CONFIG_SMP */ - /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ @@ -300,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -309,16 +303,15 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. - * Must be called with tasklist_lock held for reading, and p->sighand->siglock. */ -static int cpu_clock_sample_group_locked(unsigned int clock_idx, - struct task_struct *p, - union cpu_time_count *cpu) +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) { struct task_cputime cputime; thread_group_cputime(p, &cputime); - switch (clock_idx) { + switch (which_clock) { default: return -EINVAL; case CPUCLOCK_PROF: @@ -328,29 +321,12 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = thread_group_sched_runtime(p); + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; } -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, - cpu); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; -} - int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { @@ -1324,29 +1300,37 @@ static inline int task_cputime_expired(const struct task_cputime *sample, * fastpath_timer_check - POSIX CPU timers fast path. * * @tsk: The task (thread) being checked. - * @sig: The signal pointer for that task. * - * If there are no timers set return false. Otherwise snapshot the task and - * thread group timers, then compare them with the corresponding expiration - # times. Returns true if a timer has expired, else returns false. + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. */ -static inline int fastpath_timer_check(struct task_struct *tsk, - struct signal_struct *sig) +static inline int fastpath_timer_check(struct task_struct *tsk) { - struct task_cputime task_sample = { - .utime = tsk->utime, - .stime = tsk->stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime - }; - struct task_cputime group_sample; + struct signal_struct *sig = tsk->signal; - if (task_cputime_zero(&tsk->cputime_expires) && - task_cputime_zero(&sig->cputime_expires)) + if (unlikely(!sig)) return 0; - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) - return 1; - thread_group_cputime(tsk, &group_sample); - return task_cputime_expired(&group_sample, &sig->cputime_expires); + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + if (!task_cputime_zero(&sig->cputime_expires)) { + struct task_cputime group_sample; + + thread_group_cputime(tsk, &group_sample); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + return 0; } /* @@ -1358,43 +1342,34 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; - struct signal_struct *sig; - struct sighand_struct *sighand; - unsigned long flags; BUG_ON(!irqs_disabled()); - /* Pick up tsk->signal and make sure it's valid. */ - sig = tsk->signal; /* * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. Also check that - * tsk->signal is non-NULL; this probably can't happen but cover the - * possibility anyway. + * group timers. If that's so, just return. */ - if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) + if (!fastpath_timer_check(tsk)) return; - sighand = lock_task_sighand(tsk, &flags); - if (likely(sighand)) { - /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + spin_lock(&tsk->sighand->siglock); + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - } - unlock_task_sighand(tsk, &flags); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); /* * Now that all the timers on our list have the firing flag, @@ -1433,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group_locked(clock_idx, tsk, &now); + cpu_clock_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { diff --git a/kernel/sched.c b/kernel/sched.c index c51b5d276665..260c22cc530a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4039,55 +4039,22 @@ EXPORT_PER_CPU_SYMBOL(kstat); /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. */ -static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq) +unsigned long long task_delta_exec(struct task_struct *p) { + struct rq *rq; + unsigned long flags; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); if (task_current(rq, p)) { u64 delta_exec; update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) - return delta_exec; + ns = delta_exec; } - return 0; -} - -/* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. - */ -unsigned long long task_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - u64 ns; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); - - return ns; -} - -/* - * Return sum_exec_runtime for the thread group plus any more ns on the - * sched_clock that have not yet been banked in case the task is currently - * running. - */ -unsigned long long thread_group_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - u64 ns; - struct rq *rq; - struct task_cputime totals; - - rq = task_rq_lock(p, &flags); - thread_group_cputime(p, &totals); - ns = totals.sum_exec_runtime + task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); return ns; } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43987e2..d6903bd0c7a8 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -270,3 +270,139 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +#ifdef CONFIG_SMP + +/** + * thread_group_cputime_account_user - Maintain utime for a thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @cputime: Time value by which to increment the utime field of that + * structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void thread_group_cputime_account_user( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->utime = cputime_add(times->utime, cputime); + put_cpu_no_resched(); + } +} + +/** + * thread_group_cputime_account_system - Maintain stime for a thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @cputime: Time value by which to increment the stime field of that + * structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void thread_group_cputime_account_system( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->stime = cputime_add(times->stime, cputime); + put_cpu_no_resched(); + } +} + +/** + * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a + * thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of that structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void thread_group_cputime_account_exec_runtime( + struct thread_group_cputime *tgtimes, + unsigned long long ns) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->sum_exec_runtime += ns; + put_cpu_no_resched(); + } +} + +#else /* CONFIG_SMP */ + +static inline void thread_group_cputime_account_user( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime); +} + +static inline void thread_group_cputime_account_system( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime); +} + +static inline void thread_group_cputime_account_exec_runtime( + struct thread_group_cputime *tgtimes, + unsigned long long ns) +{ + tgtimes->totals->sum_exec_runtime += ns; +} + +#endif /* CONFIG_SMP */ + +/* + * These are the generic time-accounting routines that use the above + * functions. They are the functions actually called by the scheduler. + */ +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_user(&sig->cputime, cputime); +} + +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_system(&sig->cputime, cputime); +} + +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_exec_runtime(&sig->cputime, ns); +} -- cgit v1.2.1 From 695698500912c4479ddf4723e492de3970ff8530 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 14:54:23 +0200 Subject: sched: rework wakeup preemption Rework the wakeup preemption to work on real runtime instead of the virtual runtime. This greatly simplifies the code. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 133 ++-------------------------------------------------- 1 file changed, 4 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3b89aa6594a9..c20899763457 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -408,64 +408,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) return __sched_period(nr_running); } -/* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -1281,53 +1223,11 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. */ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } -/* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - /* * Preempt the current task with a newly woken task if needed: */ @@ -1336,7 +1236,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; + s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1374,33 +1274,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } - - if (wakeup_preempt_entity(se, pse) == 1) + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > wakeup_gran(pse)) resched_task(curr); } -- cgit v1.2.1 From 940959e93949e839c14f8ddc3b9b0e34a2ab6e29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:42 +0200 Subject: sched: fixlet for group load balance We should not only correct the increment for the initial group, but should be consistent and do so for all the groups we encounter. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c20899763457..0c59da7e3120 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1027,7 +1027,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - long more_w; if (!tg->parent) return wl; @@ -1039,18 +1038,17 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - long S, rw, s, a, b; + long more_w; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1059,7 +1057,11 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b)/D(b); + wl = s*(a-b); + + if (likely(b)) + wl /= b; + /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1068,7 +1070,6 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. */ wg = 0; -#undef D } return wl; -- cgit v1.2.1 From 78333cdd0e472180743d35988e576d6ecc6f6ddb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:43 +0200 Subject: sched: add some comments to the bandwidth code Hopefully clarify some of this code a little. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2e228bd5395e..d570a8cc4fcd 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ if (iter->rt_runtime == RUNTIME_INF) goto next; + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -274,6 +286,9 @@ next: return more; } +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ want = rt_b->rt_runtime - rt_rq->rt_runtime; + /* + * Greedy reclaim, take back as much as we can. + */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; + /* + * Can't reclaim from ourselves or disabled runqueues. + */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ BUG_ON(want); balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; + /* + * Reset each runqueue's bandwidth settings + */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); -- cgit v1.2.1 From 4653f803e6e0d970ffeac0efd2c01743eb6c5228 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:44 +0200 Subject: sched: more sanity checks on the bandwidth settings While playing around with it, I noticed we missed some sanity checks. Also add some comments while we're there. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 669c49aa57f0..e1299de1765e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8866,11 +8866,29 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) return -EBUSY; total = to_ratio(period, runtime); + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ list_for_each_entry_rcu(child, &tg->children, siblings) { period = ktime_to_ns(child->rt_bandwidth.rt_period); runtime = child->rt_bandwidth.rt_runtime; @@ -8978,19 +8996,24 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; if (sysctl_sched_rt_period <= 0) return -EINVAL; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - ret = __rt_schedulable(tg, rt_period, rt_runtime); + ret = __rt_schedulable(NULL, 0, 0); read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); -- cgit v1.2.1 From 57fdc26d4a734a3e00c6b2fc0e1e40ff8da4dc31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:45 +0200 Subject: sched: fixup buddy selection We should set the buddy even though we might already have the TIF_RESCHED flag set. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c59da7e3120..e3f3c10f7033 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1249,6 +1249,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (unlikely(se == pse)) return; + cfs_rq_of(pse)->next = pse; + /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. @@ -1256,8 +1258,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (test_tsk_need_resched(curr)) return; - cfs_rq_of(pse)->next = pse; - /* * Batch tasks do not preempt (their preemption is driven by * the tick): -- cgit v1.2.1 From f9092f358bc2ec5367621478811f046f82873376 Mon Sep 17 00:00:00 2001 From: Jonathan Steel Date: Mon, 22 Sep 2008 13:57:45 -0700 Subject: kexec: fix segmentation fault in kimage_add_entry A segmentation fault can occur in kimage_add_entry in kexec.c when loading a kernel image into memory. The fault occurs because a page is requested by calling kimage_alloc_page with gfp_mask GFP_KERNEL and the function may actually return a page with gfp_mask GFP_HIGHUSER. The high mem page is returned because it was swapped with the kernel page due to the kernel page being a page that will shortly be copied to. This patch ensures that kimage_alloc_page returns a page that was created with the correct gfp flags. I have verified the change and fixed the whitespace damage of the original patch. Jonathan did a great job of tracking this down after he hit the problem. -- Eric Signed-off-by: Jonathan Steel Signed-off-by: Eric W. Biederman Acked-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f3f0df35d4..aef265325cd3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image, *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a - * destination page, so return it. + * destination page, so return it if it's + * gfp_flags honor the ones passed in. */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } addr = old_addr; page = old_page; break; -- cgit v1.2.1 From 4aa7361179bed905fd0f35b236a5c65db683b9e0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:46 -0700 Subject: posix-timers: don't switch to ->group_leader if ->it_process dies posix_timer_event() drops SIGEV_THREAD_ID and switches to ->group_leader if send_sigqueue() fails. This is not very useful and doesn't work reliably. send_sigqueue() can only fail if ->it_process is dead. But it can die before it dequeues the SI_TIMER signal, in that case the timer stops anyway. Remove this code. I guess it was needed a long ago to ensure that the timer is not destroyed when when its creator thread dies. Q: perhaps it makes sense to change sys_timer_settime() to return an error if ->it_process is dead? Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbff..3dfd15aecc60 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -298,6 +298,7 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { + int shared, ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -316,20 +317,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private) timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - struct task_struct *leader; - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); - - if (likely(ret >= 0)) - return ret; - - timr->it_sigev_notify = SIGEV_SIGNAL; - leader = timr->it_process->group_leader; - put_task_struct(timr->it_process); - timr->it_process = leader; - } - - return send_sigqueue(timr->sigq, timr->it_process, 1); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, timr->it_process, shared); + /* If we failed to send the signal the timer stops. */ + return ret > 0; } EXPORT_SYMBOL_GPL(posix_timer_event); -- cgit v1.2.1 From 918fc0372831dca73039e1577bfea0c2ce49bdb6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:46 -0700 Subject: posix-timers: always do get_task_struct(timer->it_process) Change the code to get/put timer->it_process regardless of SIGEV_THREAD_ID. This streamlines the create/destroy paths and allows us to simplify the usage of exit_itimers() in de_thread(). Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 3dfd15aecc60..bd9c931b3659 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -540,11 +540,10 @@ sys_timer_create(const clockid_t which_clock, */ spin_lock_irqsave(&process->sighand->siglock, flags); if (!(process->flags & PF_EXITING)) { + get_task_struct(process); new_timer->it_process = process; list_add(&new_timer->list, &process->signal->posix_timers); - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - get_task_struct(process); spin_unlock_irqrestore(&process->sighand->siglock, flags); } else { spin_unlock_irqrestore(&process->sighand->siglock, flags); @@ -561,6 +560,7 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_sigev_signo = SIGALRM; new_timer->it_sigev_value.sival_int = new_timer->it_id; process = current->group_leader; + get_task_struct(process); spin_lock_irqsave(&process->sighand->siglock, flags); new_timer->it_process = process; list_add(&new_timer->list, &process->signal->posix_timers); @@ -853,8 +853,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); @@ -881,8 +880,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); -- cgit v1.2.1 From 2cd499e38ec241691e4bce50bddc8f57e4cc9bd0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:47 -0700 Subject: posix-timers: sys_timer_create: remove the buggy PF_EXITING check sys_timer_create() return -EINVAL if the target thread has PF_EXITING. This doesn't really make sense, the sub-thread can die right after unlock. And in fact, this is just wrong. Without SIGEV_THREAD_ID good_sigevent() returns ->group_leader, and it is very possible that the leader is already dead. This is OK, we shouldn't return the error in this case. Remove this check and the comment. Note that the "process" was found under tasklist_lock, it must have ->sighand != NULL. Also, remove a couple of unneeded initializations. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index bd9c931b3659..60b262051d1d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -460,9 +460,9 @@ sys_timer_create(const clockid_t which_clock, timer_t __user * created_timer_id) { int error = 0; - struct k_itimer *new_timer = NULL; + struct k_itimer *new_timer; int new_timer_id; - struct task_struct *process = NULL; + struct task_struct *process; unsigned long flags; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -523,32 +523,12 @@ sys_timer_create(const clockid_t which_clock, read_lock(&tasklist_lock); if ((process = good_sigevent(&event))) { - /* - * We may be setting up this process for another - * thread. It may be exiting. To catch this - * case the we check the PF_EXITING flag. If - * the flag is not set, the siglock will catch - * him before it is too late (in exit_itimers). - * - * The exec case is a bit more invloved but easy - * to code. If the process is in our thread - * group (and it must be or we would not allow - * it here) and is doing an exec, it will cause - * us to be killed. In this case it will wait - * for us to die which means we can finish this - * linkage with our last gasp. I.e. no code :) - */ + get_task_struct(process); spin_lock_irqsave(&process->sighand->siglock, flags); - if (!(process->flags & PF_EXITING)) { - get_task_struct(process); - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } else { - spin_unlock_irqrestore(&process->sighand->siglock, flags); - process = NULL; - } + new_timer->it_process = process; + list_add(&new_timer->list, + &process->signal->posix_timers); + spin_unlock_irqrestore(&process->sighand->siglock, flags); } read_unlock(&tasklist_lock); if (!process) { -- cgit v1.2.1 From 36b2f046000b358b62b9d116cb10a2b1c5be5cbf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:48 -0700 Subject: posix-timers: sys_timer_create: simplify and s/tasklist/rcu/ - Change the code to do rcu_read_lock() instead of taking tasklist_lock, it is safe to get_task_struct(p) if p was found under RCU. However, now we must not use process's sighand/signal, they may be NULL. We can use current->sighand/signal instead, this "process" must belong to the current's thread-group. - Factor out the common code for 2 "if (timer_event_spec)" branches, the !timer_event_spec case can use current too. - use spin_lock_irq() instead of _irqsave(), kill "flags". Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 60b262051d1d..5b761903b49a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -463,7 +463,6 @@ sys_timer_create(const clockid_t which_clock, struct k_itimer *new_timer; int new_timer_id; struct task_struct *process; - unsigned long flags; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -521,16 +520,11 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_sigev_signo = event.sigev_signo; new_timer->it_sigev_value = event.sigev_value; - read_lock(&tasklist_lock); - if ((process = good_sigevent(&event))) { + rcu_read_lock(); + process = good_sigevent(&event); + if (process) get_task_struct(process); - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!process) { error = -EINVAL; goto out; @@ -541,19 +535,18 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_sigev_value.sival_int = new_timer->it_id; process = current->group_leader; get_task_struct(process); - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); } + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_process = process; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify * new_timer after the unlock call. */ - out: if (error) release_posix_timer(new_timer, it_id_set); -- cgit v1.2.1 From 717835d94d3e3d343a302df0a3cb9405887c3e2a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:49 -0700 Subject: posix-timers: move the initialization of timer->sigq from send to create path posix_timer_event() always populates timer->sigq with the same numbers, move this code into sys_timer_create(). Note that with this patch we can kill it_sigev_signo and it_sigev_value. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5b761903b49a..c459b29efdd4 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -312,11 +312,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_code = SI_TIMER; - timr->sigq->info.si_tid = timr->it_id; - timr->sigq->info.si_value = timr->it_sigev_value; - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); ret = send_sigqueue(timr->sigq, timr->it_process, shared); /* If we failed to send the signal the timer stops. */ @@ -537,6 +532,11 @@ sys_timer_create(const clockid_t which_clock, get_task_struct(process); } + new_timer->sigq->info.si_code = SI_TIMER; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_signo = new_timer->it_sigev_signo; + new_timer->sigq->info.si_value = new_timer->it_sigev_value; + spin_lock_irq(¤t->sighand->siglock); new_timer->it_process = process; list_add(&new_timer->list, ¤t->signal->posix_timers); -- cgit v1.2.1 From ef864c958801768fb28bd3603cd0b098b394671c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:49 -0700 Subject: posix-timers: sys_timer_create: cleanup the error handling Cleanup. - sys_timer_create() is big and complicated. The code above the "out:" label relies on the fact that "error" must be == 0. This is not very robust, make the code more explicit. Remove the unneeded initialization of error. - If idr_get_new() succeeds (as it normally should), we check the returned value twice. Move the "-EAGAIN" check under "if (error)". Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index c459b29efdd4..7be385fe4eca 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -454,9 +454,8 @@ sys_timer_create(const clockid_t which_clock, struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { - int error = 0; struct k_itimer *new_timer; - int new_timer_id; + int error, new_timer_id; struct task_struct *process; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -478,9 +477,9 @@ sys_timer_create(const clockid_t which_clock, error = idr_get_new(&posix_timers_id, (void *) new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); - if (error == -EAGAIN) - goto retry; - else if (error) { + if (error) { + if (error == -EAGAIN) + goto retry; /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) @@ -541,6 +540,8 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_process = process; list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); + + return 0; /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task @@ -548,9 +549,7 @@ sys_timer_create(const clockid_t which_clock, * new_timer after the unlock call. */ out: - if (error) - release_posix_timer(new_timer, it_id_set); - + release_posix_timer(new_timer, it_id_set); return error; } -- cgit v1.2.1 From 5a9fa73072854981a5c05eb7ba18a96d49c2804f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:50 -0700 Subject: posix-timers: kill ->it_sigev_signo and ->it_sigev_value With the recent changes ->it_sigev_signo and ->it_sigev_value are only used in sys_timer_create(), kill them. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7be385fe4eca..3eff47b0d8d5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -510,10 +510,6 @@ sys_timer_create(const clockid_t which_clock, error = -EFAULT; goto out; } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->it_sigev_signo = event.sigev_signo; - new_timer->it_sigev_value = event.sigev_value; - rcu_read_lock(); process = good_sigevent(&event); if (process) @@ -524,17 +520,18 @@ sys_timer_create(const clockid_t which_clock, goto out; } } else { - new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->it_sigev_signo = SIGALRM; - new_timer->it_sigev_value.sival_int = new_timer->it_id; + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; process = current->group_leader; get_task_struct(process); } - new_timer->sigq->info.si_code = SI_TIMER; + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_signo = new_timer->it_sigev_signo; - new_timer->sigq->info.si_value = new_timer->it_sigev_value; + new_timer->sigq->info.si_code = SI_TIMER; spin_lock_irq(¤t->sighand->siglock); new_timer->it_process = process; -- cgit v1.2.1 From 5a51b713ccf8835d5adf7217e2f86eb12b1ca851 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:51 -0700 Subject: posix-timers: lock_timer: kill the bogus ->it_id check lock_timer() checks that the timer found by idr_find(timer_id) has ->it_id == timer_id. This buys nothing. This check can fail only if sys_timer_create() unlocked idr_lock after idr_get_new(), but didn't set ->it_id = new_timer_id yet. But in that case ->it_process == NULL so lock_timer() can't succeed anyway. Also remove a couple of unneeded typecasts. Note that with or without this patch we have a small problem. sys_timer_create() doesn't ensure that the result of setting (say) ->it_sigev_notify must be visible if lock_timer() succeeds. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 3eff47b0d8d5..7185f05d53a9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -474,8 +474,7 @@ sys_timer_create(const clockid_t which_clock, goto out; } spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, (void *) new_timer, - &new_timer_id); + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); if (error) { if (error == -EAGAIN) @@ -567,12 +566,12 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) */ spin_lock_irqsave(&idr_lock, *flags); - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); + timr = idr_find(&posix_timers_id, (int) timer_id); if (timr) { spin_lock(&timr->it_lock); - if ((timr->it_id != timer_id) || !(timr->it_process) || - !same_thread_group(timr->it_process, current)) { + if (!timr->it_process || + !same_thread_group(timr->it_process, current)) { spin_unlock(&timr->it_lock); spin_unlock_irqrestore(&idr_lock, *flags); timr = NULL; -- cgit v1.2.1 From 31d9284569e38fb97117497af3e8047a6a3c86f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:51 -0700 Subject: posix-timers: lock_timer: make it readable Cleanup. Imho makes the code much more understandable. At least this patch lessens both the source and compiled code. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7185f05d53a9..95451bf7d2eb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -556,7 +556,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* @@ -564,23 +564,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) * flags part over to the timer lock. Must not let interrupts in * while we are moving the lock. */ - spin_lock_irqsave(&idr_lock, *flags); - timr = idr_find(&posix_timers_id, (int) timer_id); + timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - - if (!timr->it_process || - !same_thread_group(timr->it_process, current)) { - spin_unlock(&timr->it_lock); - spin_unlock_irqrestore(&idr_lock, *flags); - timr = NULL; - } else + if (timr->it_process && + same_thread_group(timr->it_process, current)) { spin_unlock(&idr_lock); - } else - spin_unlock_irqrestore(&idr_lock, *flags); + return timr; + } + spin_unlock(&timr->it_lock); + } + spin_unlock_irqrestore(&idr_lock, *flags); - return timr; + return NULL; } /* -- cgit v1.2.1 From eb3f938fd6292dc79f43a5fe14784b044776e9f0 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 22 Sep 2008 14:42:40 -0700 Subject: ntp: let update_persistent_clock() sleep This is a change that makes the 11-minute RTC update be run in the process context. This is so that update_persistent_clock() can sleep, which may be required for certain types of RTC hardware -- most notably I2C devices. Signed-off-by: Maciej W. Rozycki Cc: Roman Zippel Cc: Rik van Riel Cc: David Brownell Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c6921aa1a42a..450a45cb01c1 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,13 +10,13 @@ #include #include -#include #include #include #include #include #include #include +#include #include /* @@ -218,11 +218,11 @@ void second_overflow(void) /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; -static void sync_cmos_clock(unsigned long dummy); +static void sync_cmos_clock(struct work_struct *work); -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); -static void sync_cmos_clock(unsigned long dummy) +static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); + schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + schedule_delayed_work(&sync_cmos_work, 0); } #else -- cgit v1.2.1 From 5cd1c9c5cf30d4b33df3d3f74d8142f278d536b7 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 22 Sep 2008 14:42:43 -0700 Subject: timekeeping: fix rounding problem during clock update Due to a rounding problem during a clock update it's possible for readers to observe the clock jumping back by 1nsec. The following simplified example demonstrates the problem: cycle xtime 0 0 1000 999999.6 2000 1999999.2 3000 2999998.8 ... 1500 = 1499999.4 = 0.0 + 1499999.4 = 999999.6 + 499999.8 When reading the clock only the full nanosecond part is used, while timekeeping internally keeps nanosecond fractions. If the clock is now updated at cycle 1500 here, a nanosecond is missing due to the truncation. The simple fix is to round up the xtime value during the update, this also changes the distance to the reference time, but the adjustment will automatically take care that it stays under control. Signed-off-by: Roman Zippel Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f961c9..5ecbfc39a268 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -454,7 +454,7 @@ void update_wall_time(void) #else offset = clock->cycle_interval; #endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. @@ -479,9 +479,12 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + /* store full nanoseconds into xtime after rounding it up and + * add the remainder to the error difference. + */ + xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); -- cgit v1.2.1 From d40e944c25fb4642adb2a4c580a48218a9f3f824 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 22 Sep 2008 14:42:44 -0700 Subject: ntp: improve adjtimex frequency rounding Change PPM_SCALE_INV_SHIFT so that it doesn't throw away any input bits (19 is the amount of the factor 2 in PPM_SCALE), the output frequency can then be calculated back to its input value, as the inverse divide produce a slightly larger value, which is then correctly rounded by the final shift. Reported-by: Martin Ziegler Signed-off-by: Roman Zippel Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 450a45cb01c1..ddb0465a6baa 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -406,9 +406,8 @@ adj_done: if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, - NTP_SCALE_SHIFT); + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; -- cgit v1.2.1 From b87f17242da6b2ac6db2d179b2f93fb84cff2fbe Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Thu, 25 Sep 2008 09:53:54 +0530 Subject: sched: maintain only task entities in cfs_rq->tasks list cfs_rq->tasks list is used by the load balancer to iterate over all the tasks. Currently it holds all the entities (both task and group entities) because of which there is a need to check for group entities explicitly during load balancing. This patch changes the cfs_rq->tasks list to hold only task entities. Signed-off-by: Bharata B Rao Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e3f3c10f7033..95c1295ad26d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -528,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -541,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } cfs_rq->nr_running--; se->on_rq = 0; - list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1335,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) if (next == &cfs_rq->tasks) return NULL; - /* Skip over entities that are not tasks */ - do { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - - if (next == &cfs_rq->tasks && !entity_is_task(se)) - return NULL; - - cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); + se = list_entry(next, struct sched_entity, group_node); + p = task_of(se); + cfs_rq->balance_iterator = next->next; return p; } -- cgit v1.2.1 From 379daf6290814e41f14880094b7b773640df2461 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Sep 2008 18:43:34 -0700 Subject: IO resources, x86: ioremap sanity check to catch mapping requests exceeding the BAR sizes Go through the iomem resource tree to check if any of the ioremap() requests span more than any slot in the iomem resource tree and do a WARN_ON() if we hit this check. This will raise a red-flag, if some driver is mapping more than what is needed. And hopefully identify possible corruptions much earlier. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/resource.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index fc59dcc4795b..1d003a50ee17 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -827,3 +827,36 @@ static int __init reserve_setup(char *str) } __setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (p->start <= addr && (p->end >= addr + size - 1)) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + addr, addr + size - 1, p->start, p->end, p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} -- cgit v1.2.1 From 13eb83754b40bf01dc84e52a08d4196d1b719a0e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Sep 2008 10:10:12 +0200 Subject: IO resources, x86: ioremap sanity check to catch mapping requests exceeding, fix fix this build error: kernel/resource.c: In function 'iomem_map_sanity_check': kernel/resource.c:842: error: implicit declaration of function 'r_next' kernel/resource.c:842: warning: assignment makes pointer from integer without a cast r_next() was only available if CONFIG_PROCFS was enabled. and fix this build warning: kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 2 has type 'resource_size_t' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'long unsigned int' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'resource_size_t' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'resource_size_t' resource_t can be 32 bits. Signed-off-by: Ingo Molnar --- kernel/resource.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 1d003a50ee17..7797dae85b50 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { @@ -852,7 +852,11 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) continue; printk(KERN_WARNING "resource map sanity check conflict: " "0x%llx 0x%llx 0x%llx 0x%llx %s\n", - addr, addr + size - 1, p->start, p->end, p->name); + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); err = -1; break; } -- cgit v1.2.1 From 18d6522b86d21a04c8ac1ea79747e2e434a956d9 Mon Sep 17 00:00:00 2001 From: Atsuo Igarashi Date: Fri, 26 Sep 2008 10:36:41 -0500 Subject: kgdb: could not write to the last of valid memory with kgdb On the ARM architecture, kgdb will crash the kernel if the last byte of valid memory is written due to a flush_icache_range flushing beyond the memory boundary. Signed-off-by: Atsuo Igarashi Signed-off-by: Jason Wessel --- kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index eaa21fc9ad1d..949806ab67de 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -488,7 +488,7 @@ static int write_mem_msg(int binary) if (err) return err; if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length + 1); + flush_icache_range(addr, addr + length); return 0; } -- cgit v1.2.1 From d7161a65341556bacb5e6654e133803f46f51063 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 26 Sep 2008 10:36:41 -0500 Subject: kgdb, x86, arm, mips, powerpc: ignore user space single stepping On the x86 arch, user space single step exceptions should be ignored if they occur in the kernel space, such as ptrace stepping through a system call. First check if it is kgdb that is executing a single step, then ensure it is not an accidental traversal into the user space, while in kgdb, any other time the TIF_SINGLESTEP is set, kgdb should ignore the exception. On x86, arm, mips and powerpc, the kgdb_contthread usage was inconsistent with the way single stepping is implemented in the kgdb core. The arch specific stub should always set the kgdb_cpu_doing_single_step correctly if it is single stepping. This allows kgdb to correctly process an instruction steps if ptrace happens to be requesting an instruction step over a system call. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 949806ab67de..25d955dbb989 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1462,7 +1462,7 @@ acquirelock: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = 0; i < NR_CPUS; i++) atomic_set(&passive_cpu_wait[i], 1); } @@ -1475,7 +1475,7 @@ acquirelock: #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -1494,7 +1494,7 @@ acquirelock: kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); kgdb_deactivate_sw_breakpoints(); kgdb_single_step = 0; - kgdb_contthread = NULL; + kgdb_contthread = current; exception_level = 0; /* Talk to debugger with gdbserial protocol */ @@ -1508,7 +1508,7 @@ acquirelock: kgdb_info[ks->cpu].task = NULL; atomic_set(&cpu_in_kgdb[ks->cpu], 0); - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = NR_CPUS-1; i >= 0; i--) atomic_set(&passive_cpu_wait[i], 0); /* -- cgit v1.2.1 From 7086efe1c1536f6bc160e7d60a9bfd645b91f279 Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang, v3 - fix UP lockup - another set of UP/SMP cleanups and simplifications Signed-off-by: Frank Mayhar Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - kernel/sched_stats.h | 126 ++++++++++++++++----------------------------------- 2 files changed, 38 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 260c22cc530a..29a3152c45db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4046,7 +4046,6 @@ unsigned long long task_delta_exec(struct task_struct *p) unsigned long flags; u64 ns = 0; - rq = task_rq_lock(p, &flags); if (task_current(rq, p)) { u64 delta_exec; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index d6903bd0c7a8..b8c156979cf2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -276,133 +276,83 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) * on CONFIG_SCHEDSTATS. */ -#ifdef CONFIG_SMP - /** - * thread_group_cputime_account_user - Maintain utime for a thread group. + * account_group_user_time - Maintain utime for a thread group. * - * @tgtimes: Pointer to thread_group_cputime structure. - * @cputime: Time value by which to increment the utime field of that - * structure. + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. * * If thread group time is being maintained, get the structure for the * running CPU and update the utime field there. */ -static inline void thread_group_cputime_account_user( - struct thread_group_cputime *tgtimes, - cputime_t cputime) +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) { - if (tgtimes->totals) { + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { struct task_cputime *times; - times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); times->utime = cputime_add(times->utime, cputime); put_cpu_no_resched(); } } /** - * thread_group_cputime_account_system - Maintain stime for a thread group. + * account_group_system_time - Maintain stime for a thread group. * - * @tgtimes: Pointer to thread_group_cputime structure. - * @cputime: Time value by which to increment the stime field of that - * structure. + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. * * If thread group time is being maintained, get the structure for the * running CPU and update the stime field there. */ -static inline void thread_group_cputime_account_system( - struct thread_group_cputime *tgtimes, - cputime_t cputime) +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) { - if (tgtimes->totals) { + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { struct task_cputime *times; - times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); times->stime = cputime_add(times->stime, cputime); put_cpu_no_resched(); } } /** - * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a - * thread group. + * account_group_exec_runtime - Maintain exec runtime for a thread group. * - * @tgtimes: Pointer to thread_group_cputime structure. + * @tsk: Pointer to task structure. * @ns: Time value by which to increment the sum_exec_runtime field - * of that structure. + * of the thread_group_cputime structure. * * If thread group time is being maintained, get the structure for the * running CPU and update the sum_exec_runtime field there. */ -static inline void thread_group_cputime_account_exec_runtime( - struct thread_group_cputime *tgtimes, - unsigned long long ns) +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) { - if (tgtimes->totals) { + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { struct task_cputime *times; - times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); times->sum_exec_runtime += ns; put_cpu_no_resched(); } } - -#else /* CONFIG_SMP */ - -static inline void thread_group_cputime_account_user( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime); -} - -static inline void thread_group_cputime_account_system( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime); -} - -static inline void thread_group_cputime_account_exec_runtime( - struct thread_group_cputime *tgtimes, - unsigned long long ns) -{ - tgtimes->totals->sum_exec_runtime += ns; -} - -#endif /* CONFIG_SMP */ - -/* - * These are the generic time-accounting routines that use the above - * functions. They are the functions actually called by the scheduler. - */ -static inline void account_group_user_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_user(&sig->cputime, cputime); -} - -static inline void account_group_system_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_system(&sig->cputime, cputime); -} - -static inline void account_group_exec_runtime(struct task_struct *tsk, - unsigned long long ns) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_exec_runtime(&sig->cputime, ns); -} -- cgit v1.2.1 From 7659e349672bb0d378ef8d7d62bae4c53d2bdd18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 14:06:45 +0200 Subject: hrtimer: migrate pending list on cpu offline Impact: hrtimers which are on the pending list are not migrated at cpu offline and can be stale forever Add the pending list migration when CONFIG_HIGH_RES_TIMERS is enabled Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a74..580bc66ae136 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1610,10 +1610,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } +#ifdef CONFIG_HIGH_RES_TIMERS +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + struct hrtimer *timer; + int raise = 0; + + while (!list_empty(&old_base->cb_pending)) { + timer = list_entry(old_base->cb_pending.next, + struct hrtimer, cb_entry); + + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); + timer->base = &new_base->clock_base[timer->base->index]; + list_add_tail(&timer->cb_entry, &new_base->cb_pending); + raise = 1; + } + return raise; +} +#else +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + return 0; +} +#endif + static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); @@ -1630,10 +1656,16 @@ static void migrate_hrtimers(int cpu) &new_base->clock_base[i]); } + if (migrate_hrtimer_pending(old_base, new_base)) + raise = 1; + spin_unlock(&old_base->lock); spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); + + if (raise) + hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.2.1 From 41e1022eae71707f1ce6801a746f70b1e57b7567 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 14:09:39 +0200 Subject: hrtimer: fix migration of CB_IRQSAFE_NO_SOFTIRQ hrtimers Impact: Stale timers after a CPU went offline. commit 37bb6cb4097e29ffee970065b74499cbf10603a3 hrtimer: unlock hrtimer_wakeup changed the hrtimer sleeper callback mode to CB_IRQSAFE_NO_SOFTIRQ due to locking problems. A result of this change is that when enqueue is called for an already expired hrtimer the callback function is not longer called directly from the enqueue code. The normal callers have been fixed in the code, but the migration code which moves hrtimers from a dead CPU to a live CPU was not made aware of this. This can be fixed by checking the timer state after the call to enqueue in the migration code. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 580bc66ae136..ac2f6d6d4868 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1591,11 +1591,12 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; + int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); @@ -1607,7 +1608,27 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); + +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Happens with high res enabled when the timer was + * already expired and the callback mode is + * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ + * (hrtimer_sleeper). The enqueue code does not move + * them to the soft irq pending list for + * performance/latency reasons, but in the migration + * state, we need to do that otherwise we end up with + * a stale timer. + */ + if (timer->state == HRTIMER_STATE_INACTIVE) { + timer->state = HRTIMER_STATE_PENDING; + list_add_tail(&timer->cb_entry, + &new_base->cpu_base->cb_pending); + raise = 1; + } +#endif } + return raise; } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1652,8 +1673,9 @@ static void migrate_hrtimers(int cpu) spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + if (migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i])) + raise = 1; } if (migrate_hrtimer_pending(old_base, new_base)) -- cgit v1.2.1 From b00c1a99e7758f794923c61e5cd55268d61c9469 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 15:44:46 +0200 Subject: hrtimer: mark migration state Impact: during migration active hrtimers can be seen as inactive The migration code removes the hrtimers from the queues of the dead CPU and sets the state temporary to INACTIVE. The enqueue code sets it to ACTIVE/PENDING again. Prevent that the wrong state can be seen by using a separate migration state bit. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ac2f6d6d4868..ace723dd1e52 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1602,7 +1602,13 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device @@ -1620,13 +1626,15 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * state, we need to do that otherwise we end up with * a stale timer. */ - if (timer->state == HRTIMER_STATE_INACTIVE) { + if (timer->state == HRTIMER_STATE_MIGRATE) { timer->state = HRTIMER_STATE_PENDING; list_add_tail(&timer->cb_entry, &new_base->cpu_base->cb_pending); raise = 1; } #endif + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; } return raise; } -- cgit v1.2.1 From ccc7dadf736639da86f3e0c86832c11a66fc8221 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 15:47:42 +0200 Subject: hrtimer: prevent migration of per CPU hrtimers Impact: per CPU hrtimers can be migrated from a dead CPU The hrtimer code has no knowledge about per CPU timers, but we need to prevent the migration of such timers and warn when such a timer is active at migration time. Explicitely mark the timers as per CPU and use a more understandable mode descriptor for the interrupts safe unlocked callback mode, which is used by hrtimer_sleeper and the scheduler code. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 37 +++++++++++++++++++++++++------------ kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 2 +- kernel/trace/trace_sysprof.c | 2 +- 4 files changed, 29 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ace723dd1e52..cdec83e722fa 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; - case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + case HRTIMER_CB_IRQSAFE_PERCPU: + case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site - * takes care of this. + * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer) timer_stats_account_hrtimer(timer); fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. @@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@ -1592,7 +1594,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; @@ -1603,6 +1605,18 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); + /* + * Should not happen. Per CPU timers should be + * canceled _before_ the migration code is called + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { + __remove_hrtimer(timer, old_base, + HRTIMER_STATE_INACTIVE, 0); + WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", + timer, timer->function, dcpu); + continue; + } + /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the * timer could be seen as !active and just vanish away @@ -1619,12 +1633,11 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, /* * Happens with high res enabled when the timer was * already expired and the callback mode is - * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ - * (hrtimer_sleeper). The enqueue code does not move - * them to the soft irq pending list for - * performance/latency reasons, but in the migration - * state, we need to do that otherwise we end up with - * a stale timer. + * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The + * enqueue code does not move them to the soft irq + * pending list for performance/latency reasons, but + * in the migration state, we need to do that + * otherwise we end up with a stale timer. */ if (timer->state == HRTIMER_STATE_MIGRATE) { timer->state = HRTIMER_STATE_PENDING; @@ -1682,7 +1695,7 @@ static void migrate_hrtimers(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { if (migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i])) + &new_base->clock_base[i], cpu)) raise = 1; } diff --git a/kernel/sched.c b/kernel/sched.c index 13dd2db9fb2d..ad1962dc0aa2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -1119,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 39019b3f7621..cb02324bdb88 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -625,7 +625,7 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index bb948e52ce20..db58fb66a135 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,7 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } -- cgit v1.2.1 From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sun, 28 Sep 2008 23:09:31 +0100 Subject: mm owner: fix race between swapoff and exit There's a race between mm->owner assignment and swapoff, more easily seen when task slab poisoning is turned on. The condition occurs when try_to_unuse() runs in parallel with an exiting task. A similar race can occur with callers of get_task_mm(), such as /proc// or ptrace or page migration. CPU0 CPU1 try_to_unuse looks at mm = task0->mm increments mm->mm_users task 0 exits mm->owner needs to be updated, but no new owner is found (mm_users > 1, but no other task has task->mm = task0->mm) mm_update_next_owner() leaves mmput(mm) decrements mm->mm_users task0 freed dereferencing mm->owner fails The fix is to notify the subsystem via mm_owner_changed callback(), if no new owner is found, by specifying the new task as NULL. Jiri Slaby: mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but must be set after that, so as not to pass NULL as old owner causing oops. Daisuke Nishimura: mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task() and its callers need to take account of this situation to avoid oops. Hugh Dickins: Lockdep warning and hang below exec_mmap() when testing these patches. exit_mm() up_reads mmap_sem before calling mm_update_next_owner(), so exec_mmap() now needs to do the same. And with that repositioning, there's now no point in mm_need_new_owner() allowing for NULL mm. Reported-by: Hugh Dickins Signed-off-by: Balbir Singh Signed-off-by: Jiri Slaby Signed-off-by: Daisuke Nishimura Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- kernel/exit.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13932abde159..a0123d75ec9a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) diff --git a/kernel/exit.c b/kernel/exit.c index 16395644a98f..85a83c831856 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -627,6 +625,16 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: -- cgit v1.2.1 From bfcd17a6c5529bc37234cfa720a047cf9397bcfc Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 6 Aug 2008 15:12:22 +0200 Subject: Configure out file locking features This patch adds the CONFIG_FILE_LOCKING option which allows to remove support for advisory locks. With this patch enabled, the flock() system call, the F_GETLK, F_SETLK and F_SETLKW operations of fcntl() and NFS support are disabled. These features are not necessarly needed on embedded systems. It allows to save ~11 Kb of kernel code and data: text data bss dec hex filename 1125436 118764 212992 1457192 163c28 vmlinux.old 1114299 118564 212992 1445855 160fdf vmlinux -11137 -200 0 -11337 -2C49 +/- This patch has originally been written by Matt Mackall , and is part of the Linux Tiny project. Signed-off-by: Thomas Petazzoni Signed-off-by: Matt Mackall Cc: matthew@wil.cx Cc: linux-fsdevel@vger.kernel.org Cc: mpm@selenic.com Cc: akpm@linux-foundation.org Signed-off-by: J. Bruce Fields --- kernel/sys_ni.c | 1 + kernel/sysctl.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 08d6e1bb99ac..503d8d4eb80a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -125,6 +125,7 @@ cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_flock); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec0886fa3d..4588b2cf2ecb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,7 +97,7 @@ static int sixty = 60; static int neg_one = -1; #endif -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) static int two = 2; #endif @@ -1261,6 +1261,7 @@ static struct ctl_table fs_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASES, .procname = "leases-enable", @@ -1269,6 +1270,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #ifdef CONFIG_DNOTIFY { .ctl_name = FS_DIR_NOTIFY, @@ -1280,6 +1282,7 @@ static struct ctl_table fs_table[] = { }, #endif #ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", @@ -1291,6 +1294,7 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &two, }, +#endif { .procname = "aio-nr", .data = &aio_nr, -- cgit v1.2.1 From 1508487e7f16d992ad23cabd3712563ff912f413 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Sep 2008 08:28:17 +0200 Subject: timers: fix itimer/many thread hang, fix fix bogus rq dereference: v3 removed the locking but also removed the rq initialization. Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 29a3152c45db..ebb03def564b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4042,10 +4042,12 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ unsigned long long task_delta_exec(struct task_struct *p) { - struct rq *rq; unsigned long flags; + struct rq *rq; u64 ns = 0; + rq = task_rq_lock(p, &flags); + if (task_current(rq, p)) { u64 delta_exec; @@ -4055,6 +4057,8 @@ unsigned long long task_delta_exec(struct task_struct *p) ns = delta_exec; } + task_rq_unlock(rq, &flags); + return ns; } -- cgit v1.2.1 From 64b9e0294d24a4204232e13e01630b0690e48d61 Mon Sep 17 00:00:00 2001 From: "Amit K. Arora" Date: Tue, 30 Sep 2008 17:15:39 +0530 Subject: sched: minor optimizations in wake_affine and select_task_rq_fair This patch does following: o Removes unused variable and argument "rq". o Optimizes one of the "if" conditions in wake_affine() - i.e. if "balanced" is true, we need not do rest of the calculations in the condition. o If this cpu is same as the previous cpu (on which woken up task was running when it went to sleep), no need to call wake_affine at all. Signed-off-by: Amit K Arora Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 95c1295ad26d..fcbe850a5a90 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1088,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1136,8 +1136,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { + if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= + tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1156,16 +1156,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *rq, *this_rq; + struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); - rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (prev_cpu == this_cpu) + goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1193,13 +1194,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * Start passive balancing when half the imbalance_pct * limit is reached. -- cgit v1.2.1 From 0c5d1eb77a8be917b638344a22afe1398236482b Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 1 Oct 2008 14:46:18 -0700 Subject: genirq: record trigger type Genirq hasn't previously recorded the trigger type used by any given IRQ, although some irq_chip support has done so. That data can be useful when troubleshooting. This patch records it in the relevant irq_desc.status bits, and improves consistency between the two driver-visible calls affected: - Make set_irq_type() usage match request_irq() usage: * IRQ_TYPE_NONE should be a NOP; succeed, so irq_chip methods won't have to handle that case any more (many do it wrong). * IRQ_TYPE_PROBE is ignored; any buggy out-of-tree callers might need to switch over to the real IRQ probing code. * emit the same diagnostics (from shared utility code) - Their kerneldoc now reflects usage: * request_irq() flags include IRQF_TRIGGER_* to specify active edge(s)/level ... docs previously omitted that * set_irq_type() is declared in so callers should use the (bit-equivalent) IRQ_TYPE_* symbols there Also: adds a warning about shared IRQs that don't end up using the requested trigger mode; and fix an unrelated "sparse" warning. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Acked-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 15 ++++++++------- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 21 ++++++++++++++++++--- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d663338cb4a8..5203a599d211 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -111,9 +111,9 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) EXPORT_SYMBOL(set_irq_chip); /** - * set_irq_type - set the irq type for an irq + * set_irq_type - set the irq trigger type for an irq * @irq: irq number - * @type: interrupt type - see include/linux/interrupt.h + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int set_irq_type(unsigned int irq, unsigned int type) { @@ -127,11 +127,12 @@ int set_irq_type(unsigned int irq, unsigned int type) } desc = irq_desc + irq; - if (desc->chip->set_type) { - spin_lock_irqsave(&desc->lock, flags); - ret = desc->chip->set_type(irq, type); - spin_unlock_irqrestore(&desc->lock, flags); - } + if (type == IRQ_TYPE_NONE) + return 0; + + spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_trigger(desc, irq, flags); + spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_type); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 08a849a22447..422dd00c8bd3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -10,6 +10,9 @@ extern void irq_chip_set_defaults(struct irq_chip *chip); /* Set default handler: */ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, + unsigned long flags); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d62f69ba7453..e59157b591f8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,7 +216,7 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); -int set_irq_wake_real(unsigned int irq, unsigned int on) +static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; int ret = -ENXIO; @@ -305,10 +305,11 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) desc->handle_irq = NULL; } -static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, +int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { int ret; + struct irq_chip *chip = desc->chip; if (!chip || !chip->set_type) { /* @@ -326,6 +327,11 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, pr_err("setting trigger mode %d for irq %u failed (%pF)\n", (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); + else { + /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= flags & IRQ_TYPE_SENSE_MASK; + } return ret; } @@ -404,7 +410,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc->chip, irq, new->flags); + ret = __irq_set_trigger(desc, irq, new->flags); if (ret) { spin_unlock_irqrestore(&desc->lock, flags); @@ -430,6 +436,14 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Set default affinity mask once everything is setup */ irq_select_affinity(irq); + + } else if ((new->flags & IRQF_TRIGGER_MASK) + && (new->flags & IRQF_TRIGGER_MASK) + != (desc->status & IRQ_TYPE_SENSE_MASK)) { + /* hope the handler works with the actual trigger mode... */ + pr_warning("IRQ %d uses trigger mode %d; requested %d\n", + irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), + (int)(new->flags & IRQF_TRIGGER_MASK)); } *p = new; @@ -586,6 +600,7 @@ EXPORT_SYMBOL(free_irq); * IRQF_SHARED Interrupt is shared * IRQF_DISABLED Disable local interrupts while processing * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy + * IRQF_TRIGGER_* Specify active edge(s) or level * */ int request_irq(unsigned int irq, irq_handler_t handler, -- cgit v1.2.1 From 8e85b4b553fc932e1c5141feb5fda389b7f5db01 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Oct 2008 10:50:53 +0200 Subject: softirqs, debug: preemption check if a preempt count leaks out of a softirq handler it can be very hard to figure it out. Add a debug check for this. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/softirq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 82e32aadedd8..1cf1e2f2c406 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -205,7 +205,18 @@ restart: do { if (pending & 1) { + int prev_count = preempt_count(); + h->action(h); + + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered sotfirq %ld %p" + "with preempt_count %08x," + " exited with %08x?\n", h - softirq_vec, + h->action, prev_count, preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qsctr_inc(cpu); } h++; -- cgit v1.2.1 From aa94fbd5ccd840c8ab26d02439ec799b03a72547 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Oct 2008 14:50:14 -0700 Subject: fix error-path NULL deref in alloc_posix_timer() Found by static checker (http://repo.or.cz/w/smatch.git). Signed-off-by: Dan Carpenter Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbff..5131e5471169 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); - tmr = NULL; + return NULL; } memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; -- cgit v1.2.1 From 2133b5d7ff531bc15a923db4a6a50bf96c561be9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Oct 2008 16:06:39 -0700 Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults disabled. This is a debugging feature to detect infinite loops in kernel code, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch, replacing the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). The changes from the patch in tip/core/rcu include making the config variable name match that in treercu, changing from seconds to jiffies to avoid spurious warnings, and printing a boot message when this feature is enabled. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 166 +++++++++++++++++++++++++++------------------------- 1 file changed, 86 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ed15128ca2c9..0d07e6e51578 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, } } +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) * period (if necessary). */ -#ifdef CONFIG_DEBUG_RCU_STALL - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_check = get_seconds() + 3; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = get_seconds() - rcp->gp_check; - if (delta < 2L || cpus_empty(rcp->cpumask)) { - spin_unlock(&rcp->lock); - return; - } - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... */ - - printk(KERN_ERR "RCU detected CPU stalls:"); - for_each_cpu_mask(cpu, rcp->cpumask) - printk(" %d", cpu); - printk(" (detected by %d, t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(get_seconds() - rcp->gp_check) >= 0L) - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - long delta; - - delta = get_seconds() - rcp->gp_check; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { - - /* We haven't checked in, so go dump stack. */ - - print_cpu_stall(rcp); - - } else { - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } - } -} - -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ - /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; - record_gp_check_time(rcp); + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp, rdp); + check_cpu_stall(rcp); if (rdp->nxtlist) { long completed_snap = ACCESS_ONCE(rcp->completed); @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ -- cgit v1.2.1 From 2ec2b482b10a1ed3493c224f1893cddd3d33833b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Oct 2008 10:41:00 +0200 Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU, fix fix the !CONFIG_RCU_CPU_STALL_DETECTOR path: kernel/rcuclassic.c: In function '__rcu_pending': kernel/rcuclassic.c:609: error: too few arguments to function 'check_cpu_stall' Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 0d07e6e51578..37f72e551542 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -239,7 +239,7 @@ static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) { } -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) { } -- cgit v1.2.1 From 77af7e3403e7314c47b0c07fbc5e4ef21d939532 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 11:39:46 +0200 Subject: softirq, warning fix: correct a format to avoid a warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last -tip gives this warning: kernel/softirq.c: Dans la fonction «__do_softirq» : kernel/softirq.c:216: attention : format «%ld» expects type «long int», but argument 2 has type «int» This patch corrects the format type, and a small mistake in the "softirq" word. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 1cf1e2f2c406..be7a8292f992 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,7 +210,7 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered sotfirq %ld %p" + printk(KERN_ERR "huh, entered softirq %d %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, h->action, prev_count, preempt_count()); -- cgit v1.2.1 From d294eb83d8d39a29f01dad391f15fc3a29aa04f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 12:10:10 +0200 Subject: cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a warning on latest -tip: kernel/cpuset.c: Dans la fonction «scan_for_empty_cpusets» : kernel/cpuset.c:1932: attention : passing argument 1 of «list_add_tail» discards qualifiers from pointer target type Actually the struct cpuset *root passed in parameter to scan_for_empty_cpusets is not supposed to be const since an entry is added on the tail of its list. Just correct the qualifier. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9adccb2..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ -- cgit v1.2.1 From 07454bfff151d2465ada809bbaddf3548cc1097c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Oct 2008 10:51:07 +0200 Subject: clockevents: check broadcast tick device not the clock events device Impact: jiffies increment too fast. Hugh Dickins noted that with NOHZ=n and HIGHRES=n jiffies get incremented too fast. The reason is a wrong check in the broadcast enter/exit code, which keeps the local apic timer in periodic mode when the switch happens. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index bd7034542399..cb01cd8f919b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,7 +235,8 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (bc->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) @@ -245,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (bc->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; -- cgit v1.2.1 From f6121f4f8708195e88cbdf8dd8d171b226b3f858 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Fri, 3 Oct 2008 17:40:46 +0200 Subject: sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq While working on the new version of the code for SCHED_SPORADIC I noticed something strange in the present throttling mechanism. More specifically in the throttling timer handler in sched_rt.c (do_sched_rt_period_timer()) and in rt_rq_enqueue(). The problem is that, when unthrottling a runqueue, rt_rq_enqueue() only asks for rescheduling if the runqueue has a sched_entity associated to it (i.e., rt_rq->rt_se != NULL). Now, if the runqueue is the root rq (which has a rt_se = NULL) rescheduling does not take place, and it is delayed to some undefined instant in the future. This imply some random bandwidth usage by the RT tasks under throttling. For instance, setting rt_runtime_us/rt_period_us = 950ms/1000ms an RT task will get less than 95%. In our tests we got something varying between 70% to 95%. Using smaller time values, e.g., 95ms/100ms, things are even worse, and I can see values also going down to 20-25%!! The tests we performed are simply running 'yes' as a SCHED_FIFO task, and checking the CPU usage with top, but we can investigate thoroughly if you think it is needed. Things go much better, for us, with the attached patch... Don't know if it is the best approach, but it solved the issue for us. Signed-off-by: Dario Faggioli Signed-off-by: Michael Trimarchi Acked-by: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d570a8cc4fcd..cdf5740ab03e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - - enqueue_rt_entity(rt_se); + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } -- cgit v1.2.1 From 34b3ede2353604ec9861c1d900b2a835ff85de47 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 6 Oct 2008 09:27:00 +0800 Subject: sched: remove redundant code in cpu_cgroup_create() css will be initialized by cgroup core. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2caedc47e764..9715f4ce6cfe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9088,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -9097,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; } -- cgit v1.2.1 From cc1e0f4f7ad95a9eb81e1904cb16068af226180d Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 6 Oct 2008 13:50:59 -0500 Subject: kgdb: call touch_softlockup_watchdog on resume The softlockup watchdog needs to be touched when resuming the from the kgdb stopped state to avoid the printk that a CPU is stuck if the debugger was active for longer than the softlockup threshold. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 25d955dbb989..e4dcfb2272a4 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1432,6 +1433,7 @@ acquirelock: atomic_read(&kgdb_cpu_doing_single_step) != cpu) { atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1524,6 +1526,7 @@ acquirelock: kgdb_restore: /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); -- cgit v1.2.1 From 2fb7635c4cea310992a39580133099dd99ad151c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Oct 2008 09:16:04 +0200 Subject: sched: sync wakeups vs avg_overlap While looking at the code I wondered why we always do: sync && avg_overlap < migration_cost Which is a bit odd, since the overlap test was meant to detect sync wakeups so using it to specialize sync wakeups doesn't make much sense. Hence change the code to do: sync || avg_overlap < migration_cost Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fcbe850a5a90..18fd17172eb6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1103,6 +1103,11 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (!sync && sched_feat(SYNC_WAKEUPS) && + curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1127,11 +1132,8 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } + if (sync && balanced) + return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); @@ -1268,9 +1270,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && sync && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) { + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } -- cgit v1.2.1 From a5d8c3483a6e19aca95ef6a2c5890e33bfa5b293 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Oct 2008 11:35:51 +0200 Subject: sched debug: add name to sched_domain sysctl entries add /proc/sys/kernel/sched_domain/cpu0/domain0/name, to make it easier to see which specific scheduler domain remained at that entry. Since we process the scheduler domain tree and simplify it, it's not always immediately clear during debugging which domain came from where. depends on CONFIG_SCHED_DEBUG=y. Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9715f4ce6cfe..6f230596bd0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6351,7 +6351,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6379,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -7263,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) -- cgit v1.2.1 From 8083e4ad970e4eb567e31037060cdd4ba346f0c0 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Mon, 4 Aug 2008 11:59:11 -0700 Subject: [CPUFREQ][5/6] cpufreq: Changes to get_cpu_idle_time_us(), used by ondemand governor export get_cpu_idle_time_us() for it to be used in ondemand governor. Last update time can be current time when the CPU is currently non-idle, accounting for the busy time since last idle. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- kernel/time/tick-sched.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb02324bdb88..a4d219398167 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -190,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - *last_update_time = ktime_to_us(ts->idle_lastupdate); + if (!tick_nohz_enabled) + return -1; + + if (ts->idle_active) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(ktime_get()); + return ktime_to_us(ts->idle_sleeptime); } +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task -- cgit v1.2.1 From a6bebbc87a8c16eabb6bd5c6fd2d994be0236fba Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 5 Oct 2008 00:51:15 +0400 Subject: [PATCH] signal, procfs: some lock_task_sighand() users do not need rcu_read_lock() lock_task_sighand() make sure task->sighand is being protected, so we do not need rcu_read_lock(). [ exec() will get task->sighand->siglock before change task->sighand! ] But code using rcu_read_lock() _just_ to protect lock_task_sighand() only appear in procfs. (and some code in procfs use lock_task_sighand() without such redundant protection.) Other subsystem may put lock_task_sighand() into rcu_read_lock() critical region, but these rcu_read_lock() are used for protecting "for_each_process()", "find_task_by_vpid()" etc. , not for protecting lock_task_sighand(). Signed-off-by: Lai Jiangshan [ok from Oleg] Signed-off-by: Alexey Dobriyan --- kernel/sched_debug.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index bbe6b31c3c56..ad958c1ec708 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) unsigned long flags; int num_threads = 1; - rcu_read_lock(); if (lock_task_sighand(p, &flags)) { num_threads = atomic_read(&p->signal->count); unlock_task_sighand(p, &flags); } - rcu_read_unlock(); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); SEQ_printf(m, -- cgit v1.2.1 From 3bbfe0596746e1590888a6e1e6a07583265238b7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 10 Oct 2008 03:27:16 +0400 Subject: proc: remove kernel.maps_protect After commit 831830b5a2b5d413407adf380ef62fe17d6fcbf2 aka "restrict reading from /proc//maps to those who share ->mm or can ptrace" sysctl stopped being relevant because commit moved security checks from ->show time to ->start time (mm_for_maps()). Signed-off-by: Alexey Dobriyan Acked-by: Kees Cook --- kernel/sysctl.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec0886fa3d..cc3e0d7a5acf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; -extern int maps_protect; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST @@ -809,16 +808,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#endif -#ifdef CONFIG_PROC_FS - { - .ctl_name = CTL_UNNUMBERED, - .procname = "maps_protect", - .data = &maps_protect, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.1 From 5b7dba4ff834259a5623e03a565748704a8fe449 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Thu, 9 Oct 2008 13:21:30 -0500 Subject: sched_clock: prevent scd->clock from moving backwards When sched_clock_cpu() couples the clocks between two cpus, it may increment scd->clock beyond the GTOD tick window that __update_sched_clock() uses to clamp the clock. A later call to __update_sched_clock() may move the clock back to scd->tick_gtod + TICK_NSEC, violating the clock's monotonic property. This patch ensures that scd->clock will not be set backward. Signed-off-by: Dave Kleikamp Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096ddfe3..81787248b60f 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * max(scd->clock, scd->tick_gtod + TICK_NSEC)); */ clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); -- cgit v1.2.1 From 061b1bd394ca8628b7c24eb4658ba3535da4249a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Sep 2008 14:46:44 -0700 Subject: Staging: add TAINT_CRAP for all drivers/staging code We need to add a flag for all code that is in the drivers/staging/ directory to prevent all other kernel developers from worrying about issues here, and to notify users that the drivers might not be as good as they are normally used to. Based on code from Andreas Gruenbacher and Jeff Mahoney to provide a TAINT flag for the support level of a kernel module in the Novell enterprise kernel release. This is the kernel portion of this feature, the ability for the flag to be set needs to be done in the build process and will happen in a follow-up patch. Cc: Andreas Gruenbacher Cc: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 11 +++++++++++ kernel/panic.c | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04b..152b1655bbac 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1806,6 +1806,7 @@ static noinline struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; + char *staging; unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; @@ -1960,6 +1961,14 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + staging = get_modinfo(sechdrs, infoindex, "staging"); + if (staging) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + /* Now copy in args */ args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(args)) { @@ -2556,6 +2565,8 @@ static char *module_flags(struct module *mod, char *buf) buf[bx++] = 'P'; if (mod->taints & TAINT_FORCED_MODULE) buf[bx++] = 'F'; + if (mod->taints & TAINT_CRAP) + buf[bx++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a6c89b..98e2047f4db7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -155,6 +155,7 @@ EXPORT_SYMBOL(panic); * 'U' - Userspace-defined naughtiness. * 'A' - ACPI table overridden. * 'W' - Taint on warning. + * 'C' - modules from drivers/staging are loaded. * * The string is overwritten by the next call to print_taint(). */ @@ -163,7 +164,7 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', @@ -173,7 +174,8 @@ const char *print_tainted(void) tainted & TAINT_USER ? 'U' : ' ', tainted & TAINT_DIE ? 'D' : ' ', tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); + tainted & TAINT_WARN ? 'W' : ' ', + tainted & TAINT_CRAP ? 'C' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); -- cgit v1.2.1 From 030aebd2e439a2ebcca2b0ce30a02ed84feb043e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Oct 2008 12:25:45 -0700 Subject: rangetimer: fix BUG_ON reported by Ingo There's a small race/chance that, while hrtimers are enabled globally, they're later not enabled when we're calling the hrtimer_interrupt() function, which then BUG_ON()'s for that. This patch closes that race/gap. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index eb2cf984959f..b17657d8d81a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1395,11 +1395,15 @@ void hrtimer_peek_ahead_timers(void) unsigned long flags; struct tick_device *td; struct clock_event_device *dev; - + struct hrtimer_cpu_base *cpu_base; if (hrtimer_hres_active()) return; local_irq_save(flags); + cpu_base = &__get_cpu_var(hrtimer_bases); + if (!cpu_base->hres_active) + goto out; + td = &__get_cpu_var(tick_cpu_device); if (!td) goto out; -- cgit v1.2.1 From dc4304f7deee29fcdf6a2b62f7146ea7f505fd42 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 13 Oct 2008 10:32:15 -0400 Subject: rangetimers: fix the bug reported by Ingo for real and please hand me a brown paper bag (thanks to Thomas for pointing out this very obvious bug) Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b17657d8d81a..2bd230be1cb5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1395,15 +1395,11 @@ void hrtimer_peek_ahead_timers(void) unsigned long flags; struct tick_device *td; struct clock_event_device *dev; - struct hrtimer_cpu_base *cpu_base; - if (hrtimer_hres_active()) + + if (!hrtimer_hres_active()) return; local_irq_save(flags); - cpu_base = &__get_cpu_var(hrtimer_bases); - if (!cpu_base->hres_active) - goto out; - td = &__get_cpu_var(tick_cpu_device); if (!td) goto out; -- cgit v1.2.1 From 9c9f4ded90a59eee84e15f5fd38c03d60184e112 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 13 Oct 2008 10:37:26 +0100 Subject: tty: Add a kref count Introduce a kref to the tty structure and use it to protect the tty->signal tty references. For now we don't introduce it for anything else. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++++- kernel/sys.c | 4 +--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe84796..30de644a40c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -802,6 +802,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; + sig->tty = NULL; sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; @@ -838,6 +839,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } @@ -1227,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - p->signal->tty = current->signal->tty; + tty_kref_put(p->signal->tty); + p->signal->tty = tty_kref_get(current->signal->tty); set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc0901d..234d9454294e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1060,9 +1060,7 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(sid); - spin_lock(&group_leader->sighand->siglock); - group_leader->signal->tty = NULL; - spin_unlock(&group_leader->sighand->siglock); + proc_clear_tty(group_leader); err = session; out: -- cgit v1.2.1 From 95f9bfc6b76e862265a2d70ae061eec18fe14140 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 13 Oct 2008 10:39:23 +0100 Subject: tty: Move tty_write_message out of kernel/printk This is pure tty code so put it in the tty layer where it can be with the locking relevant material it uses Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- kernel/printk.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b51b1567bb55..a430fd04008b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1291,22 +1291,6 @@ static int __init disable_boot_consoles(void) } late_initcall(disable_boot_consoles); -/** - * tty_write_message - write a message to a certain tty, not just the console. - * @tty: the destination tty_struct - * @msg: the message to write - * - * This is used for messages that need to be redirected to a specific tty. - * We don't put it into the syslog queue right now maybe in the future if - * really needed. - */ -void tty_write_message(struct tty_struct *tty, char *msg) -{ - if (tty && tty->ops->write) - tty->ops->write(tty, msg, strlen(msg)); - return; -} - #if defined CONFIG_PRINTK /* -- cgit v1.2.1 From dbda4c0b97b18fd59b3964548361b4f92357f730 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 13 Oct 2008 10:40:53 +0100 Subject: tty: Fix abusers of current->sighand->tty Various people outside the tty layer still stick their noses in behind the scenes. We need to make sure they also obey the locking and referencing rules. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/auditsc.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index dd68b9059418..f6006a60df5d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, #endif spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; + tty = current->signal->tty; /* Safe as we hold the siglock */ ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 59cedfb040e7..cf5bc2f5f9c3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask) unsigned n; if (unlikely(!ctx)) return 0; - n = ctx->major; + switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && @@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - mutex_lock(&tty_mutex); - read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; - read_unlock(&tasklist_lock); + spin_unlock_irq(&tsk->sighand->siglock); + audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" @@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->egid, context->sgid, context->fsgid, tty, tsk->sessionid); - mutex_unlock(&tty_mutex); audit_log_task_info(ab, tsk); if (context->filterkey) { -- cgit v1.2.1 From 97e1c18e8d17bd87e1e383b2e9d9fc740332c8e2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 18 Jul 2008 12:16:16 -0400 Subject: tracing: Kernel Tracepoints Implementation of kernel tracepoints. Inspired from the Linux Kernel Markers. Allows complete typing verification by declaring both tracing statement inline functions and probe registration/unregistration static inline functions within the same macro "DEFINE_TRACE". No format string is required. See the tracepoint Documentation and Samples patches for usage examples. Taken from the documentation patch : "A tracepoint placed in code provides a hook to call a function (probe) that you can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or "off" (no probe is attached). When a tracepoint is "off" it has no effect, except for adding a tiny time penalty (checking a condition for a branch) and space penalty (adding a few bytes for the function call at the end of the instrumented function and adds a data structure in a separate section). When a tracepoint is "on", the function you provide is called each time the tracepoint is executed, in the execution context of the caller. When the function provided ends its execution, it returns to the caller (continuing from the tracepoint site). You can put tracepoints at important locations in the code. They are lightweight hooks that can pass an arbitrary number of parameters, which prototypes are described in a tracepoint declaration placed in a header file." Addition and removal of tracepoints is synchronized by RCU using the scheduler (and preempt_disable) as guarantees to find a quiescent state (this is really RCU "classic"). The update side uses rcu_barrier_sched() with call_rcu_sched() and the read/execute side uses "preempt_disable()/preempt_enable()". We make sure the previous array containing probes, which has been scheduled for deletion by the rcu callback, is indeed freed before we proceed to the next update. It therefore limits the rate of modification of a single tracepoint to one update per RCU period. The objective here is to permit fast batch add/removal of probes on _different_ tracepoints. Changelog : - Use #name ":" #proto as string to identify the tracepoint in the tracepoint table. This will make sure not type mismatch happens due to connexion of a probe with the wrong type to a tracepoint declared with the same name in a different header. - Add tracepoint_entry_free_old. - Change __TO_TRACE to get rid of the 'i' iterator. Masami Hiramatsu : Tested on x86-64. Performance impact of a tracepoint : same as markers, except that it adds about 70 bytes of instructions in an unlikely branch of each instrumented function (the for loop, the stack setup and the function call). It currently adds a memory read, a test and a conditional branch at the instrumentation site (in the hot path). Immediate values will eventually change this into a load immediate, test and branch, which removes the memory read which will make the i-cache impact smaller (changing the memory read for a load immediate removes 3-4 bytes per site on x86_32 (depending on mov prefixes), or 7-8 bytes on x86_64, it also saves the d-cache hit). About the performance impact of tracepoints (which is comparable to markers), even without immediate values optimizations, tests done by Hideo Aoki on ia64 show no regression. His test case was using hackbench on a kernel where scheduler instrumentation (about 5 events in code scheduler code) was added. Quoting Hideo Aoki about Markers : I evaluated overhead of kernel marker using linux-2.6-sched-fixes git tree, which includes several markers for LTTng, using an ia64 server. While the immediate trace mark feature isn't implemented on ia64, there is no major performance regression. So, I think that we don't have any issues to propose merging marker point patches into Linus's tree from the viewpoint of performance impact. I prepared two kernels to evaluate. The first one was compiled without CONFIG_MARKERS. The second one was enabled CONFIG_MARKERS. I downloaded the original hackbench from the following URL: http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c I ran hackbench 5 times in each condition and calculated the average and difference between the kernels. The parameter of hackbench: every 50 from 50 to 800 The number of CPUs of the server: 2, 4, and 8 Below is the results. As you can see, major performance regression wasn't found in any case. Even if number of processes increases, differences between marker-enabled kernel and marker- disabled kernel doesn't increase. Moreover, if number of CPUs increases, the differences doesn't increase either. Curiously, marker-enabled kernel is better than marker-disabled kernel in more than half cases, although I guess it comes from the difference of memory access pattern. * 2 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 4.811 | 4.872 | +0.061 | +1.27 | 100 | 9.854 | 10.309 | +0.454 | +4.61 | 150 | 15.602 | 15.040 | -0.562 | -3.6 | 200 | 20.489 | 20.380 | -0.109 | -0.53 | 250 | 25.798 | 25.652 | -0.146 | -0.56 | 300 | 31.260 | 30.797 | -0.463 | -1.48 | 350 | 36.121 | 35.770 | -0.351 | -0.97 | 400 | 42.288 | 42.102 | -0.186 | -0.44 | 450 | 47.778 | 47.253 | -0.526 | -1.1 | 500 | 51.953 | 52.278 | +0.325 | +0.63 | 550 | 58.401 | 57.700 | -0.701 | -1.2 | 600 | 63.334 | 63.222 | -0.112 | -0.18 | 650 | 68.816 | 68.511 | -0.306 | -0.44 | 700 | 74.667 | 74.088 | -0.579 | -0.78 | 750 | 78.612 | 79.582 | +0.970 | +1.23 | 800 | 85.431 | 85.263 | -0.168 | -0.2 | -------------------------------------------------------------- * 4 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 2.586 | 2.584 | -0.003 | -0.1 | 100 | 5.254 | 5.283 | +0.030 | +0.56 | 150 | 8.012 | 8.074 | +0.061 | +0.76 | 200 | 11.172 | 11.000 | -0.172 | -1.54 | 250 | 13.917 | 14.036 | +0.119 | +0.86 | 300 | 16.905 | 16.543 | -0.362 | -2.14 | 350 | 19.901 | 20.036 | +0.135 | +0.68 | 400 | 22.908 | 23.094 | +0.186 | +0.81 | 450 | 26.273 | 26.101 | -0.172 | -0.66 | 500 | 29.554 | 29.092 | -0.461 | -1.56 | 550 | 32.377 | 32.274 | -0.103 | -0.32 | 600 | 35.855 | 35.322 | -0.533 | -1.49 | 650 | 39.192 | 38.388 | -0.804 | -2.05 | 700 | 41.744 | 41.719 | -0.025 | -0.06 | 750 | 45.016 | 44.496 | -0.520 | -1.16 | 800 | 48.212 | 47.603 | -0.609 | -1.26 | -------------------------------------------------------------- * 8 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 2.094 | 2.072 | -0.022 | -1.07 | 100 | 4.162 | 4.273 | +0.111 | +2.66 | 150 | 6.485 | 6.540 | +0.055 | +0.84 | 200 | 8.556 | 8.478 | -0.078 | -0.91 | 250 | 10.458 | 10.258 | -0.200 | -1.91 | 300 | 12.425 | 12.750 | +0.325 | +2.62 | 350 | 14.807 | 14.839 | +0.032 | +0.22 | 400 | 16.801 | 16.959 | +0.158 | +0.94 | 450 | 19.478 | 19.009 | -0.470 | -2.41 | 500 | 21.296 | 21.504 | +0.208 | +0.98 | 550 | 23.842 | 23.979 | +0.137 | +0.57 | 600 | 26.309 | 26.111 | -0.198 | -0.75 | 650 | 28.705 | 28.446 | -0.259 | -0.9 | 700 | 31.233 | 31.394 | +0.161 | +0.52 | 750 | 34.064 | 33.720 | -0.344 | -1.01 | 800 | 36.320 | 36.114 | -0.206 | -0.57 | -------------------------------------------------------------- Signed-off-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Acked-by: 'Peter Zijlstra' Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/module.c | 66 +++++++- kernel/tracepoint.c | 476 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 542 insertions(+), 1 deletion(-) create mode 100644 kernel/tracepoint.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df7c3e2..8f9ce7ec21b6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FTRACE) += trace/ diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04b..661d73db786e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,6 +46,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -1831,6 +1832,8 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int tracepointsindex; + unsigned int tracepointsstringsindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2117,6 +2120,9 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); + tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, + "__tracepoints_strings"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2144,6 +2150,12 @@ static noinline struct module *load_module(void __user *umod, mod->num_markers = sechdrs[markersindex].sh_size / sizeof(*mod->markers); #endif +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; + mod->num_tracepoints = + sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); +#endif + /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2162,11 +2174,16 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + if (!mod->taints) { #ifdef CONFIG_MARKERS - if (!mod->taints) marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif +#ifdef CONFIG_TRACEPOINTS + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); +#endif + } err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2717,3 +2734,50 @@ void module_update_markers(void) mutex_unlock(&module_mutex); } #endif + +#ifdef CONFIG_TRACEPOINTS +void module_update_tracepoints(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!mod->taints) + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. + * Returns 1 if current found. + */ +int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + struct module *iter_mod; + int found = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(iter_mod, &modules, list) { + if (!iter_mod->taints) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints, + iter_mod->tracepoints + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + } + mutex_unlock(&module_mutex); + return found; +} +#endif diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c new file mode 100644 index 000000000000..42e86ddbd2a0 --- /dev/null +++ b/kernel/tracepoint.c @@ -0,0 +1,476 @@ +/* + * Copyright (C) 2008 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct tracepoint __start___tracepoints[]; +extern struct tracepoint __stop___tracepoints[]; + +/* Set to 1 to enable tracepoint debug output */ +static const int tracepoint_debug; + +/* + * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the + * builtin and module tracepoints and the hash table. + */ +static DEFINE_MUTEX(tracepoints_mutex); + +/* + * Tracepoint hash table, containing the active tracepoints. + * Protected by tracepoints_mutex. + */ +#define TRACEPOINT_HASH_BITS 6 +#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) + +/* + * Note about RCU : + * It is used to to delay the free of multiple probes array until a quiescent + * state is reached. + * Tracepoint entries modifications are protected by the tracepoints_mutex. + */ +struct tracepoint_entry { + struct hlist_node hlist; + void **funcs; + int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; + char name[0]; +}; + +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; + +static void free_old_closure(struct rcu_head *head) +{ + struct tracepoint_entry *entry = container_of(head, + struct tracepoint_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +{ + if (!old) + return; + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); +} + +static void debug_print_probes(struct tracepoint_entry *entry) +{ + int i; + + if (!tracepoint_debug) + return; + + for (i = 0; entry->funcs[i]; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); +} + +static void * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0; + void **old, **new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->funcs; + if (old) { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) + if (old[nr_probes] == probe) + return ERR_PTR(-EEXIST); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (old) + memcpy(new, old, nr_probes * sizeof(void *)); + new[nr_probes] = probe; + entry->refcount = nr_probes + 1; + entry->funcs = new; + debug_print_probes(entry); + return old; +} + +static void * +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0, nr_del = 0, i; + void **old, **new; + + old = entry->funcs; + + debug_print_probes(entry); + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) { + if ((!probe || old[nr_probes] == probe)) + nr_del++; + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->funcs = NULL; + entry->refcount = 0; + debug_print_probes(entry); + return old; + } else { + int j = 0; + /* N -> M, (N > 1, M > 0) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i]; i++) + if ((probe && old[i] != probe)) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->funcs = new; + } + debug_print_probes(entry); + return old; +} + +/* + * Get tracepoint if the tracepoint is present in the tracepoint hash table. + * Must be called with tracepoints_mutex held. + * Returns NULL if not present. + */ +static struct tracepoint_entry *get_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + u32 hash = jhash(name, strlen(name), 0); + + head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the tracepoint to the tracepoint hash table. Must be called with + * tracepoints_mutex held. + */ +static struct tracepoint_entry *add_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + size_t name_len = strlen(name) + 1; + u32 hash = jhash(name, name_len-1, 0); + + head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + printk(KERN_NOTICE + "tracepoint %s busy\n", name); + return ERR_PTR(-EEXIST); /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. + */ + e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + memcpy(&e->name[0], name, name_len); + e->funcs = NULL; + e->refcount = 0; + e->rcu_pending = 0; + hlist_add_head(&e->hlist, head); + return e; +} + +/* + * Remove the tracepoint from the tracepoint hash table. Must be called with + * mutex_lock held. + */ +static int remove_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + int found = 0; + size_t len = strlen(name) + 1; + u32 hash = jhash(name, len-1, 0); + + head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + found = 1; + break; + } + } + if (!found) + return -ENOENT; + if (e->refcount) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; +} + +/* + * Sets the probe callback corresponding to one tracepoint. + */ +static void set_tracepoint(struct tracepoint_entry **entry, + struct tracepoint *elem, int active) +{ + WARN_ON(strcmp((*entry)->name, elem->name) != 0); + + /* + * rcu_assign_pointer has a smp_wmb() which makes sure that the new + * probe callbacks array is consistent before setting a pointer to it. + * This array is referenced by __DO_TRACE from + * include/linux/tracepoints.h. A matching smp_read_barrier_depends() + * is used. + */ + rcu_assign_pointer(elem->funcs, (*entry)->funcs); + elem->state = active; +} + +/* + * Disable a tracepoint and its probe callback. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. + */ +static void disable_tracepoint(struct tracepoint *elem) +{ + elem->state = 0; +} + +/** + * tracepoint_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of tracepoints. + */ +void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ + struct tracepoint *iter; + struct tracepoint_entry *mark_entry; + + mutex_lock(&tracepoints_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_tracepoint(iter->name); + if (mark_entry) { + set_tracepoint(&mark_entry, iter, + !!mark_entry->refcount); + } else { + disable_tracepoint(iter); + } + } + mutex_unlock(&tracepoints_mutex); +} + +/* + * Update probes, removing the faulty probes. + */ +static void tracepoint_update_probes(void) +{ + /* Core kernel tracepoints */ + tracepoint_update_probe_range(__start___tracepoints, + __stop___tracepoints); + /* tracepoints in modules. */ + module_update_tracepoints(); +} + +/** + * tracepoint_probe_register - Connect a probe to a tracepoint + * @name: tracepoint name + * @probe: probe handler + * + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. + */ +int tracepoint_probe_register(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + int ret = 0; + void *old; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } + } + /* + * If we detect that a call_rcu is pending for this tracepoint, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old)) { + ret = PTR_ERR(old); + goto end; + } + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + WARN_ON(!entry); + tracepoint_entry_free_old(entry, old); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register); + +/** + * tracepoint_probe_unregister - Disconnect a probe from a tracepoint + * @name: tracepoint name + * @probe: probe function pointer + * + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int tracepoint_probe_unregister(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + int ret = -ENOENT; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + if (entry->rcu_pending) + rcu_barrier(); + old = tracepoint_entry_remove_probe(entry, probe); + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + tracepoint_entry_free_old(entry, old); + remove_tracepoint(name); /* Ignore busy error message */ + ret = 0; +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); + +/** + * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. + * @tracepoint: current tracepoints (in), next tracepoint (out) + * @begin: beginning of the range + * @end: end of the range + * + * Returns whether a next tracepoint has been found (1) or not (0). + * Will return the first tracepoint in the range if the input tracepoint is + * NULL. + */ +int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end) +{ + if (!*tracepoint && begin != end) { + *tracepoint = begin; + return 1; + } + if (*tracepoint >= begin && *tracepoint < end) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); + +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + if (!iter->module) { + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints, __stop___tracepoints); + if (found) + goto end; + } + /* tracepoints in modules. */ + found = module_get_iter_tracepoints(iter); +end: + if (!found) + tracepoint_iter_reset(iter); +} + +void tracepoint_iter_start(struct tracepoint_iter *iter) +{ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_start); + +void tracepoint_iter_next(struct tracepoint_iter *iter) +{ + iter->tracepoint++; + /* + * iter->tracepoint may be invalid because we blindly incremented it. + * Make sure it is valid by marshalling on the tracepoints, getting the + * tracepoints from following modules if necessary. + */ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_next); + +void tracepoint_iter_stop(struct tracepoint_iter *iter) +{ +} +EXPORT_SYMBOL_GPL(tracepoint_iter_stop); + +void tracepoint_iter_reset(struct tracepoint_iter *iter) +{ + iter->module = NULL; + iter->tracepoint = NULL; +} +EXPORT_SYMBOL_GPL(tracepoint_iter_reset); -- cgit v1.2.1 From 0a16b6075843325dc402edf80c1662838b929aff Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 18 Jul 2008 12:16:17 -0400 Subject: tracing, sched: LTTng instrumentation - scheduler Instrument the scheduler activity (sched_switch, migration, wakeups, wait for a task, signal delivery) and process/thread creation/destruction (fork, exit, kthread stop). Actually, kthread creation is not instrumented in this patch because it is architecture dependent. It allows to connect tracers such as ftrace which detects scheduling latencies, good/bad scheduler decisions. Tools like LTTng can export this scheduler information along with instrumentation of the rest of the kernel activity to perform post-mortem analysis on the scheduler activity. About the performance impact of tracepoints (which is comparable to markers), even without immediate values optimizations, tests done by Hideo Aoki on ia64 show no regression. His test case was using hackbench on a kernel where scheduler instrumentation (about 5 events in code scheduler code) was added. See the "Tracepoints" patch header for performance result detail. Changelog : - Change instrumentation location and parameter to match ftrace instrumentation, previously done with kernel markers. [ mingo@elte.hu: conflict resolutions ] Signed-off-by: Mathieu Desnoyers Acked-by: 'Peter Zijlstra' Signed-off-by: Ingo Molnar --- kernel/exit.c | 10 +++++++++- kernel/fork.c | 3 +++ kernel/kthread.c | 5 +++++ kernel/sched.c | 17 ++++++----------- kernel/signal.c | 3 +++ 5 files changed, 26 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 85a83c831856..7b71f87f1207 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -149,7 +150,10 @@ static void __exit_signal(struct task_struct *tsk) static void delayed_put_task_struct(struct rcu_head *rhp) { - put_task_struct(container_of(rhp, struct task_struct, rcu)); + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + trace_sched_process_free(tsk); + put_task_struct(tsk); } @@ -1074,6 +1078,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); + trace_sched_process_exit(tsk); + exit_sem(tsk); exit_files(tsk); exit_fs(tsk); @@ -1675,6 +1681,8 @@ static long do_wait(enum pid_type type, struct pid *pid, int options, struct task_struct *tsk; int retval; + trace_sched_process_wait(pid); + add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: /* diff --git a/kernel/fork.c b/kernel/fork.c index 30de644a40c4..cfaff92f61ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1364,6 +1365,8 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; + trace_sched_process_fork(current, p); + nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) diff --git a/kernel/kthread.c b/kernel/kthread.c index 96cff2f8710b..50598e29439a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,6 +13,7 @@ #include #include #include +#include #define KTHREAD_NICE_LEVEL (-5) @@ -206,6 +207,8 @@ int kthread_stop(struct task_struct *k) /* It could exit after stop_info.k set, but before wake_up_process. */ get_task_struct(k); + trace_sched_kthread_stop(k); + /* Must init completion *before* thread sees kthread_stop_info.k */ init_completion(&kthread_stop_info.done); smp_wmb(); @@ -221,6 +224,8 @@ int kthread_stop(struct task_struct *k) ret = kthread_stop_info.err; mutex_unlock(&kthread_stop_lock); + trace_sched_kthread_stop_ret(ret); + return ret; } EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/sched.c b/kernel/sched.c index 6f230596bd0c..3d1ad130c24e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -1936,6 +1937,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * just go back and repeat. */ rq = task_rq_lock(p, &flags); + trace_sched_wait_task(rq, p); running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; @@ -2297,9 +2299,7 @@ out_activate: success = 1; out_running: - trace_mark(kernel_sched_wakeup, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + trace_sched_wakeup(rq, p); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@ -2432,9 +2432,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_mark(kernel_sched_wakeup_new, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + trace_sched_wakeup_new(rq, p); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2607,11 +2605,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_mark(kernel_sched_schedule, - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - prev->pid, next->pid, prev->state, - rq, prev, next); + trace_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2851,6 +2845,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; + trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ diff --git a/kernel/signal.c b/kernel/signal.c index e661b01d340f..bf40ecc87b26 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -803,6 +804,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigpending *pending; struct sigqueue *q; + trace_sched_signal_send(sig, t); + assert_spin_locked(&t->sighand->siglock); if (!prepare_signal(sig, t)) return 0; -- cgit v1.2.1 From b07c3f193a8074aa4afe43cfa8ae38ec4c7ccfa9 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 18 Jul 2008 12:16:17 -0400 Subject: ftrace: port to tracepoints Porting the trace_mark() used by ftrace to tracepoints. (cleanup) Changelog : - Change error messages : marker -> tracepoint [ mingo@elte.hu: conflict resolutions ] Signed-off-by: Mathieu Desnoyers Acked-by: 'Peter Zijlstra' Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_switch.c | 120 +++++++-------------------------- kernel/trace/trace_sched_wakeup.c | 135 ++++++++++---------------------------- 2 files changed, 58 insertions(+), 197 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index cb817a209aa0..789e927abc9c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -9,8 +9,8 @@ #include #include #include -#include #include +#include #include "trace.h" @@ -19,16 +19,17 @@ static int __read_mostly tracer_enabled; static atomic_t sched_ref; static void -sched_switch_func(void *private, void *__rq, struct task_struct *prev, +probe_sched_switch(struct rq *__rq, struct task_struct *prev, struct task_struct *next) { - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; int cpu; + if (!atomic_read(&sched_ref)) + return; + tracing_record_cmdline(prev); tracing_record_cmdline(next); @@ -37,95 +38,42 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev, local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = ctx_trace->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - tracing_sched_switch_trace(tr, data, prev, next, flags); + tracing_sched_switch_trace(ctx_trace, data, prev, next, flags); atomic_dec(&data->disabled); local_irq_restore(flags); } -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - if (!atomic_read(&sched_ref)) - return; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. - */ - sched_switch_func(probe_data, __rq, prev, next); -} - static void -wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct - task_struct *curr) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) { - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; int cpu; - if (!tracer_enabled) + if (!likely(tracer_enabled)) return; - tracing_record_cmdline(curr); + tracing_record_cmdline(current); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = ctx_trace->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, + flags); atomic_dec(&data->disabled); local_irq_restore(flags); } -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_func(probe_data, __rq, task, curr); -} - static void sched_switch_reset(struct trace_array *tr) { int cpu; @@ -140,60 +88,40 @@ static int tracing_sched_register(void) { int ret; - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); + ret = register_trace_sched_wakeup(probe_sched_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); + ret = register_trace_sched_wakeup_new(probe_sched_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &ctx_trace); + ret = register_trace_sched_switch(probe_sched_switch); if (ret) { - pr_info("sched trace: Couldn't add marker" + pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_schedule\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_wakeup_new(probe_sched_wakeup); fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_wakeup(probe_sched_wakeup); return ret; } static void tracing_sched_unregister(void) { - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_switch(probe_sched_switch); + unregister_trace_sched_wakeup_new(probe_sched_wakeup); + unregister_trace_sched_wakeup(probe_sched_wakeup); } static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e303ccb62cdf..08206b4e29c4 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "trace.h" @@ -112,18 +112,18 @@ static int report_latency(cycle_t delta) } static void notrace -wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, +probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; long disabled; int cpu; + tracing_record_cmdline(prev); + if (unlikely(!tracer_enabled)) return; @@ -140,11 +140,11 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, return; /* The task we are waiting for is waking up */ - data = tr->data[wakeup_cpu]; + data = wakeup_trace->data[wakeup_cpu]; /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&tr->data[cpu]->disabled); + disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); if (likely(disabled != 1)) goto out; @@ -155,7 +155,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags); /* * usecs conversion is slow so we try to delay the conversion @@ -174,39 +174,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, t0 = nsecs_to_usecs(T0); t1 = nsecs_to_usecs(T1); - update_max_tr(tr, wakeup_task, wakeup_cpu); + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); out_unlock: - __wakeup_reset(tr); + __wakeup_reset(wakeup_trace); __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - tracing_record_cmdline(prev); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. - */ - wakeup_sched_switch(probe_data, __rq, prev, next); + atomic_dec(&wakeup_trace->data[cpu]->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -240,19 +215,24 @@ static void wakeup_reset(struct trace_array *tr) } static void -wakeup_check_start(struct trace_array *tr, struct task_struct *p, - struct task_struct *curr) +probe_wakeup(struct rq *rq, struct task_struct *p) { int cpu = smp_processor_id(); unsigned long flags; long disabled; + if (likely(!tracer_enabled)) + return; + + tracing_record_cmdline(p); + tracing_record_cmdline(current); + if (likely(!rt_task(p)) || p->prio >= wakeup_prio || - p->prio >= curr->prio) + p->prio >= current->prio) return; - disabled = atomic_inc_return(&tr->data[cpu]->disabled); + disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); if (unlikely(disabled != 1)) goto out; @@ -264,7 +244,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, goto out_locked; /* reset the trace */ - __wakeup_reset(tr); + __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); wakeup_prio = p->prio; @@ -274,74 +254,37 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, local_save_flags(flags); - tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - trace_function(tr, tr->data[wakeup_cpu], + wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); + trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: __raw_spin_unlock(&wakeup_lock); out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct trace_array **ptr = probe_data; - struct trace_array *tr = *ptr; - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_check_start(tr, task, curr); + atomic_dec(&wakeup_trace->data[cpu]->disabled); } static void start_wakeup_tracer(struct trace_array *tr) { int ret; - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); + ret = register_trace_sched_wakeup(probe_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return; } - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); + ret = register_trace_sched_wakeup_new(probe_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &wakeup_trace); + ret = register_trace_sched_switch(probe_wakeup_sched_switch); if (ret) { - pr_info("sched trace: Couldn't add marker" + pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_schedule\n"); goto fail_deprobe_wake_new; } @@ -363,28 +306,18 @@ static void start_wakeup_tracer(struct trace_array *tr) return; fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_wakeup_new(probe_wakeup); fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_wakeup(probe_wakeup); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; unregister_ftrace_function(&trace_ops); - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_switch(probe_wakeup_sched_switch); + unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup); } static void wakeup_tracer_init(struct trace_array *tr) -- cgit v1.2.1 From 5f87f1121895dc09d2d1c1db5f14af6aa4ce3e94 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Jul 2008 14:15:22 +0200 Subject: tracing: clean up tracepoints kconfig structure do not expose users to CONFIG_TRACEPOINTS - tracers can select it just fine. update ftrace to select CONFIG_TRACEPOINTS. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 263e9e6bbd60..cae2637d5e68 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -14,6 +14,7 @@ config TRACING bool select DEBUG_FS select STACKTRACE + select TRACEPOINTS config FTRACE bool "Kernel Function Tracer" -- cgit v1.2.1 From 9795302acf2817d0842e56d23df6008e43df0970 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 24 Jul 2008 16:37:23 -0400 Subject: tracepoints: use TABLE_SIZE macro Steven Rostedt suggested: | Wouldn't it look nicer to have: (TRACEPOINT_TABLE_SIZE - 1) ? Signed-off-by: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 42e86ddbd2a0..c7c62a4a75f5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -177,7 +177,7 @@ static struct tracepoint_entry *get_tracepoint(const char *name) struct tracepoint_entry *e; u32 hash = jhash(name, strlen(name), 0); - head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) return e; @@ -197,7 +197,7 @@ static struct tracepoint_entry *add_tracepoint(const char *name) size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); - head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE @@ -233,7 +233,7 @@ static int remove_tracepoint(const char *name) size_t len = strlen(name) + 1; u32 hash = jhash(name, len-1, 0); - head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { found = 1; -- cgit v1.2.1 From 36dcd67ae994fece615b7c700958d215e884b9ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 12:00:59 +0200 Subject: ftrace: ignore functions that cannot be kprobe-ed kprobes already has an extensive list of annotations for functions that should not be instrumented. Add notrace annotations to these functions as well. This is particularly useful for functions called by the NMI path. Signed-off-by: Ingo Molnar --- kernel/notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 823be11584ef..4282c0a40a57 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -550,7 +550,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notify_die(enum die_val val, const char *str, +int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { -- cgit v1.2.1 From 8da3821ba5634497da63d58a69e24a97697c4a2b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:07 -0400 Subject: ftrace: create __mcount_loc section This patch creates a section in the kernel called "__mcount_loc". This will hold a list of pointers to the mcount relocation for each call site of mcount. For example: objdump -dr init/main.o [...] Disassembly of section .text: 0000000000000000 : 0: 55 push %rbp [...] 000000000000017b : 17b: 55 push %rbp 17c: 48 89 e5 mov %rsp,%rbp 17f: 53 push %rbx 180: 48 83 ec 08 sub $0x8,%rsp 184: e8 00 00 00 00 callq 189 185: R_X86_64_PC32 mcount+0xfffffffffffffffc [...] We will add a section to point to each function call. .section __mcount_loc,"a",@progbits [...] .quad .text + 0x185 [...] The offset to of the mcount call site in init_post is an offset from the start of the section, and not the start of the function init_post. The mcount relocation is at the call site 0x185 from the start of the .text section. .text + 0x185 == init_post + 0xa We need a way to add this __mcount_loc section in a way that we do not lose the relocations after final link. The .text section here will be attached to all other .text sections after final link and the offsets will be meaningless. We need to keep track of where these .text sections are. To do this, we use the start of the first function in the section. do_one_initcall. We can make a tmp.s file with this function as a reference to the start of the .text section. .section __mcount_loc,"a",@progbits [...] .quad do_one_initcall + 0x185 [...] Then we can compile the tmp.s into a tmp.o gcc -c tmp.s -o tmp.o And link it into back into main.o. ld -r main.o tmp.o -o tmp_main.o mv tmp_main.o main.o But we have a problem. What happens if the first function in a section is not exported, and is a static function. The linker will not let the tmp.o use it. This case exists in main.o as well. Disassembly of section .init.text: 0000000000000000 : 0: 55 push %rbp 1: 48 89 e5 mov %rsp,%rbp 4: e8 00 00 00 00 callq 9 5: R_X86_64_PC32 mcount+0xfffffffffffffffc The first function in .init.text is a static function. 00000000000000a8 t __setup_set_reset_devices 000000000000105f t __setup_str_set_reset_devices 0000000000000000 t set_reset_devices The lowercase 't' means that set_reset_devices is local and is not exported. If we simply try to link the tmp.o with the set_reset_devices we end up with two symbols: one local and one global. .section __mcount_loc,"a",@progbits .quad set_reset_devices + 0x10 00000000000000a8 t __setup_set_reset_devices 000000000000105f t __setup_str_set_reset_devices 0000000000000000 t set_reset_devices U set_reset_devices We still have an undefined reference to set_reset_devices, and if we try to compile the kernel, we will end up with an undefined reference to set_reset_devices, or even worst, it could be exported someplace else, and then we will have a reference to the wrong location. To handle this case, we make an intermediate step using objcopy. We convert set_reset_devices into a global exported symbol before linking it with tmp.o and set it back afterwards. 00000000000000a8 t __setup_set_reset_devices 000000000000105f t __setup_str_set_reset_devices 0000000000000000 T set_reset_devices 00000000000000a8 t __setup_set_reset_devices 000000000000105f t __setup_str_set_reset_devices 0000000000000000 T set_reset_devices 00000000000000a8 t __setup_set_reset_devices 000000000000105f t __setup_str_set_reset_devices 0000000000000000 t set_reset_devices Now we have a section in main.o called __mcount_loc that we can place somewhere in the kernel using vmlinux.ld.S and access it to convert all these locations that call mcount into nops before starting SMP and thus, eliminating the need to do this with kstop_machine. Note, A well documented perl script (scripts/recordmcount.pl) is used to do all this in one location. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cae2637d5e68..14d9505178ca 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -7,6 +7,9 @@ config HAVE_FTRACE config HAVE_DYNAMIC_FTRACE bool +config HAVE_FTRACE_MCOUNT_RECORD + bool + config TRACER_MAX_TRACE bool @@ -122,6 +125,11 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + config FTRACE_SELFTEST bool -- cgit v1.2.1 From 68bf21aa15c85d2e9b623dcda2b1ed8893275fa1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:08 -0400 Subject: ftrace: mcount call site on boot nops core This is the infrastructure to the converting the mcount call sites recorded by the __mcount_loc section into nops on boot. It also allows for using these sites to enable tracing as normal. When the __mcount_loc section is used, the "ftraced" kernel thread is disabled. This uses the current infrastructure to record the mcount call sites as well as convert them to nops. The mcount function is kept as a stub on boot up and not converted to the ftrace_record_ip function. We use the ftrace_record_ip to only record from the table. This patch does not handle modules. That comes with a later patch. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 148 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 105 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6e3af31b403..df96d5990c04 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -792,47 +792,7 @@ static int ftrace_update_code(void) return 1; } -static int ftraced(void *ignore) -{ - unsigned long usecs; - - while (!kthread_should_stop()) { - - set_current_state(TASK_INTERRUPTIBLE); - - /* check once a second */ - schedule_timeout(HZ); - - if (unlikely(ftrace_disabled)) - continue; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (!ftraced_suspend && !ftraced_stop && - ftrace_update_code()) { - usecs = nsecs_to_usecs(ftrace_update_time); - if (ftrace_update_tot_cnt > 100000) { - ftrace_update_tot_cnt = 0; - pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", - ftrace_update_cnt, - ftrace_update_cnt != 1 ? "s" : "", - ftrace_update_tot_cnt, - usecs, usecs != 1 ? "s" : ""); - ftrace_disabled = 1; - WARN_ON_ONCE(1); - } - } - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - ftrace_shutdown_replenish(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static int __init ftrace_dyn_table_alloc(void) +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) { struct ftrace_page *pg; int cnt; @@ -859,7 +819,9 @@ static int __init ftrace_dyn_table_alloc(void) pg = ftrace_pages = ftrace_pages_start; - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld hash entries in %d pages\n", + num_to_init, cnt); for (i = 0; i < cnt; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); @@ -1556,6 +1518,104 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +static int ftrace_convert_nops(unsigned long *start, + unsigned long *end) +{ + unsigned long *p; + unsigned long addr; + unsigned long flags; + + p = start; + while (p < end) { + addr = ftrace_call_adjust(*p++); + ftrace_record_ip(addr); + ftrace_shutdown_replenish(); + } + + /* p is ignored */ + local_irq_save(flags); + __ftrace_update_code(p); + local_irq_restore(flags); + + return 0; +} + +extern unsigned long __start_mcount_loc[]; +extern unsigned long __stop_mcount_loc[]; + +void __init ftrace_init(void) +{ + unsigned long count, addr, flags; + int ret; + + /* Keep the ftrace pointer to the stub */ + addr = (unsigned long)ftrace_stub; + + local_irq_save(flags); + ftrace_dyn_arch_init(&addr); + local_irq_restore(flags); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + + ret = ftrace_dyn_table_alloc(count); + if (ret) + goto failed; + + last_ftrace_enabled = ftrace_enabled = 1; + + ret = ftrace_convert_nops(__start_mcount_loc, + __stop_mcount_loc); + + return; + failed: + ftrace_disabled = 1; +} +#else /* CONFIG_FTRACE_MCOUNT_RECORD */ +static int ftraced(void *ignore) +{ + unsigned long usecs; + + while (!kthread_should_stop()) { + + set_current_state(TASK_INTERRUPTIBLE); + + /* check once a second */ + schedule_timeout(HZ); + + if (unlikely(ftrace_disabled)) + continue; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? "s" : ""); + ftrace_disabled = 1; + WARN_ON_ONCE(1); + } + } + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + ftrace_shutdown_replenish(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + static int __init ftrace_dynamic_init(void) { struct task_struct *p; @@ -1572,7 +1632,7 @@ static int __init ftrace_dynamic_init(void) goto failed; } - ret = ftrace_dyn_table_alloc(); + ret = ftrace_dyn_table_alloc(NR_TO_INIT); if (ret) goto failed; @@ -1593,6 +1653,8 @@ static int __init ftrace_dynamic_init(void) } core_initcall(ftrace_dynamic_init); +#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ + #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) -- cgit v1.2.1 From 90d595fe5ca4b685465c068907e6e554760abea8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:09 -0400 Subject: ftrace: enable mcount recording for modules This patch enables the loading of the __mcount_section of modules and changing all the callers of mcount into nops. The modification is done before the init_module function is called, so again, we do not need to use kstop_machine to make these changes. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/module.c | 11 +++++++++++ kernel/trace/ftrace.c | 5 +++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 661d73db786e..d753fd9d83ec 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -1834,6 +1835,7 @@ static noinline struct module *load_module(void __user *umod, unsigned int markersstringsindex; unsigned int tracepointsindex; unsigned int tracepointsstringsindex; + unsigned int mcountindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2124,6 +2126,9 @@ static noinline struct module *load_module(void __user *umod, tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints_strings"); + mcountindex = find_sec(hdr, sechdrs, secstrings, + "__mcount_loc"); + /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { const char *strtab = (char *)sechdrs[strindex].sh_addr; @@ -2184,6 +2189,12 @@ static noinline struct module *load_module(void __user *umod, mod->tracepoints + mod->num_tracepoints); #endif } + + if (mcountindex) { + void *mseg = (void *)sechdrs[mcountindex].sh_addr; + ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + } + err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index df96d5990c04..ea45bb1c0fd6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,11 @@ static int ftrace_convert_nops(unsigned long *start, return 0; } +void ftrace_init_module(unsigned long *start, unsigned long *end) +{ + ftrace_convert_nops(start, end); +} + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; -- cgit v1.2.1 From a9fdda33cd7c7519b082e37538fe790f9ff684bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 22:47:17 -0400 Subject: ftrace: do not show freed records in available_filter_functions Seems that freed records can appear in the available_filter_functions list. This patch fixes that. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ea45bb1c0fd6..8affb6d00ec1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -872,15 +872,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((!(iter->flags & FTRACE_ITER_FAILURES) && + if ((rec->flags & FTRACE_FL_FREE) || + + (!(iter->flags & FTRACE_ITER_FAILURES) && (rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_FAILURES) && - (!(rec->flags & FTRACE_FL_FAILED) || - (rec->flags & FTRACE_FL_FREE))) || - - ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || + !(rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { -- cgit v1.2.1 From fed1939c64d2288938fdc1c367d49082da65e195 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 22:47:19 -0400 Subject: ftrace: remove old pointers to mcount When a mcount pointer is recorded into a table, it is used to add or remove calls to mcount (replacing them with nops). If the code is removed via removing a module, the pointers still exist. At modifying the code a check is always made to make sure the code being replaced is the code expected. In-other-words, the code being replaced is compared to what it is expected to be before being replaced. There is a very small chance that the code being replaced just happens to look like code that calls mcount (very small since the call to mcount is relative). To remove this chance, this patch adds ftrace_release to allow module unloading to remove the pointers to mcount within the module. Another change for init calls is made to not trace calls marked with __init. The tracing can not be started until after init is done anyway. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/module.c | 12 ++++++++---- kernel/trace/ftrace.c | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d753fd9d83ec..7576c2d9462f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1431,6 +1431,9 @@ static void free_module(struct module *mod) /* Module unload stuff */ module_unload_free(mod); + /* release any pointers to mcount in this module */ + ftrace_release(mod->module_core, mod->core_size); + /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1839,6 +1842,7 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + void *mseg; struct exception_table_entry *extable; mm_segment_t old_fs; @@ -2190,10 +2194,9 @@ static noinline struct module *load_module(void __user *umod, #endif } - if (mcountindex) { - void *mseg = (void *)sechdrs[mcountindex].sh_addr; - ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); - } + /* sechdrs[0].sh_size is always zero */ + mseg = (void *)sechdrs[mcountindex].sh_addr; + ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2264,6 +2267,7 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); + ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); module_free(mod, mod->module_init); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8affb6d00ec1..eadd0eaea9b6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -294,13 +294,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node) static void ftrace_free_rec(struct dyn_ftrace *rec) { - /* no locking, only called from kstop_machine */ - rec->ip = (unsigned long)ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } +void ftrace_release(void *start, unsigned long size) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = s + size; + int i; + + if (!start) + return; + + /* No interrupt should call this */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if ((rec->ip >= s) && (rec->ip < e)) + ftrace_free_rec(rec); + } + } + spin_unlock(&ftrace_lock); + +} + static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -1527,7 +1551,9 @@ static int ftrace_convert_nops(unsigned long *start, p = start; while (p < end) { addr = ftrace_call_adjust(*p++); + spin_lock(&ftrace_lock); ftrace_record_ip(addr); + spin_unlock(&ftrace_lock); ftrace_shutdown_replenish(); } @@ -1541,6 +1567,8 @@ static int ftrace_convert_nops(unsigned long *start, void ftrace_init_module(unsigned long *start, unsigned long *end) { + if (start == end) + return; ftrace_convert_nops(start, end); } -- cgit v1.2.1 From 2e2ca155cd2213b4f398031180fb3d399d5b7db9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2008 12:26:40 -0400 Subject: ftrace: new continue entry - separate out from trace_entry Some tracers will need to work with more than one entry. In order to do this the trace_entry structure was split into two fields. One for the start of all entries, and one to continue an existing entry. The trace_entry structure now has a "field" entry that consists of the previous content of the trace_entry, and a "cont" entry that is just a string buffer the size of the "field" entry. Thanks to Andrew Morton for suggesting this idea. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 267 +++++++++++++++++++++-------------------- kernel/trace/trace.h | 17 ++- kernel/trace/trace_mmiotrace.c | 12 +- 3 files changed, 160 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f3fb3db61c3..76dfe6d2466c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -817,10 +817,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) pc = preempt_count(); - entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->t = ftrace_now(raw_smp_processor_id()); - entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + entry->field.preempt_count = pc & 0xff; + entry->field.pid = (tsk) ? tsk->pid : 0; + entry->field.t = ftrace_now(raw_smp_processor_id()); + entry->field.flags = + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); @@ -835,11 +836,11 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, raw_local_irq_save(irq_flags); __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; + entry->type = TRACE_FN; + entry->field.fn.ip = ip; + entry->field.fn.parent_ip = parent_ip; __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); } @@ -862,10 +863,10 @@ void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, raw_local_irq_save(irq_flags); __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_RW; - entry->mmiorw = *rw; + entry->type = TRACE_MMIO_RW; + entry->field.mmiorw = *rw; __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -882,10 +883,10 @@ void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, raw_local_irq_save(irq_flags); __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_MAP; - entry->mmiomap = *map; + entry->type = TRACE_MMIO_MAP; + entry->field.mmiomap = *map; __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -909,12 +910,12 @@ void __trace_stack(struct trace_array *tr, tracing_generic_entry_update(entry, flags); entry->type = TRACE_STACK; - memset(&entry->stack, 0, sizeof(entry->stack)); + memset(&entry->field.stack, 0, sizeof(entry->field.stack)); trace.nr_entries = 0; trace.max_entries = FTRACE_STACK_ENTRIES; trace.skip = skip; - trace.entries = entry->stack.caller; + trace.entries = entry->field.stack.caller; save_stack_trace(&trace); } @@ -930,12 +931,12 @@ __trace_special(void *__tr, void *__data, raw_local_irq_save(irq_flags); __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); - entry->type = TRACE_SPECIAL; - entry->special.arg1 = arg1; - entry->special.arg2 = arg2; - entry->special.arg3 = arg3; + entry->type = TRACE_SPECIAL; + entry->field.special.arg1 = arg1; + entry->field.special.arg2 = arg2; + entry->field.special.arg3 = arg3; __trace_stack(tr, data, irq_flags, 4); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -955,15 +956,15 @@ tracing_sched_switch_trace(struct trace_array *tr, raw_local_irq_save(irq_flags); __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); - entry->type = TRACE_CTX; - entry->ctx.prev_pid = prev->pid; - entry->ctx.prev_prio = prev->prio; - entry->ctx.prev_state = prev->state; - entry->ctx.next_pid = next->pid; - entry->ctx.next_prio = next->prio; - entry->ctx.next_state = next->state; + entry->type = TRACE_CTX; + entry->field.ctx.prev_pid = prev->pid; + entry->field.ctx.prev_prio = prev->prio; + entry->field.ctx.prev_state = prev->state; + entry->field.ctx.next_pid = next->pid; + entry->field.ctx.next_prio = next->prio; + entry->field.ctx.next_state = next->state; __trace_stack(tr, data, flags, 5); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -984,12 +985,12 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_WAKE; - entry->ctx.prev_pid = curr->pid; - entry->ctx.prev_prio = curr->prio; - entry->ctx.prev_state = curr->state; - entry->ctx.next_pid = wakee->pid; - entry->ctx.next_prio = wakee->prio; - entry->ctx.next_state = wakee->state; + entry->field.ctx.prev_pid = curr->pid; + entry->field.ctx.prev_prio = curr->prio; + entry->field.ctx.prev_state = curr->state; + entry->field.ctx.next_pid = wakee->pid; + entry->field.ctx.next_prio = wakee->prio; + entry->field.ctx.next_state = wakee->state; __trace_stack(tr, data, flags, 6); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -1118,7 +1119,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) /* * Pick the entry with the smallest timestamp: */ - if (ent && (!next || ent->t < next->t)) { + if (ent && (!next || ent->field.t < next->field.t)) { next = ent; next_cpu = cpu; } @@ -1422,19 +1423,20 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) static void lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { + struct trace_field *field = &entry->field; int hardirq, softirq; char *comm; - comm = trace_find_cmdline(entry->pid); + comm = trace_find_cmdline(field->pid); - trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%8.8s-%-5d ", comm, field->pid); trace_seq_printf(s, "%d", cpu); trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + (field->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((field->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; - softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + hardirq = field->flags & TRACE_FLAG_HARDIRQ; + softirq = field->flags & TRACE_FLAG_SOFTIRQ; if (hardirq && softirq) { trace_seq_putc(s, 'H'); } else { @@ -1448,8 +1450,8 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) } } - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); + if (field->preempt_count) + trace_seq_printf(s, "%x", field->preempt_count); else trace_seq_puts(s, "."); } @@ -1479,6 +1481,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) struct trace_entry *next_entry = find_next_entry(iter, NULL); unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; + struct trace_field *field = &entry->field; unsigned long abs_usecs; unsigned long rel_usecs; char *comm; @@ -1488,17 +1491,17 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) if (!next_entry) next_entry = entry; - rel_usecs = ns2usecs(next_entry->t - entry->t); - abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + rel_usecs = ns2usecs(next_entry->field.t - entry->field.t); + abs_usecs = ns2usecs(entry->field.t - iter->tr->time_start); if (verbose) { - comm = trace_find_cmdline(entry->pid); + comm = trace_find_cmdline(field->pid); trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, cpu, entry->flags, - entry->preempt_count, trace_idx, - ns2usecs(entry->t), + field->pid, cpu, field->flags, + field->preempt_count, trace_idx, + ns2usecs(field->t), abs_usecs/1000, abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); @@ -1508,41 +1511,42 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) } switch (entry->type) { case TRACE_FN: - seq_print_ip_sym(s, entry->fn.ip, sym_flags); + seq_print_ip_sym(s, field->fn.ip, sym_flags); trace_seq_puts(s, " ("); - if (kretprobed(entry->fn.parent_ip)) + if (kretprobed(field->fn.parent_ip)) trace_seq_puts(s, KRETPROBE_MSG); else - seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + seq_print_ip_sym(s, field->fn.parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; case TRACE_CTX: case TRACE_WAKE: - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + T = field->ctx.next_state < sizeof(state_to_char) ? + state_to_char[field->ctx.next_state] : 'X'; - state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; + state = field->ctx.prev_state ? + __ffs(field->ctx.prev_state) + 1 : 0; S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; - comm = trace_find_cmdline(entry->ctx.next_pid); + comm = trace_find_cmdline(field->ctx.next_pid); trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + field->ctx.prev_pid, + field->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, + field->ctx.next_pid, + field->ctx.next_prio, T, comm); break; case TRACE_SPECIAL: trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->special.arg1, + field->special.arg2, + field->special.arg3); break; case TRACE_STACK: for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + seq_print_ip_sym(s, field->stack.caller[i], sym_flags); } trace_seq_puts(s, "\n"); break; @@ -1557,6 +1561,7 @@ static int print_trace_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; + struct trace_field *field; unsigned long usec_rem; unsigned long long t; unsigned long secs; @@ -1566,14 +1571,15 @@ static int print_trace_fmt(struct trace_iterator *iter) int i; entry = iter->ent; + field = &entry->field; - comm = trace_find_cmdline(iter->ent->pid); + comm = trace_find_cmdline(iter->ent->field.pid); - t = ns2usecs(entry->t); + t = ns2usecs(field->t); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + ret = trace_seq_printf(s, "%16s-%-5d ", comm, field->pid); if (!ret) return 0; ret = trace_seq_printf(s, "[%02d] ", iter->cpu); @@ -1585,18 +1591,19 @@ static int print_trace_fmt(struct trace_iterator *iter) switch (entry->type) { case TRACE_FN: - ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + ret = seq_print_ip_sym(s, field->fn.ip, sym_flags); if (!ret) return 0; if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - entry->fn.parent_ip) { + field->fn.parent_ip) { ret = trace_seq_printf(s, " <-"); if (!ret) return 0; - if (kretprobed(entry->fn.parent_ip)) + if (kretprobed(field->fn.parent_ip)) ret = trace_seq_puts(s, KRETPROBE_MSG); else - ret = seq_print_ip_sym(s, entry->fn.parent_ip, + ret = seq_print_ip_sym(s, + field->fn.parent_ip, sym_flags); if (!ret) return 0; @@ -1607,26 +1614,26 @@ static int print_trace_fmt(struct trace_iterator *iter) break; case TRACE_CTX: case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + S = field->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[field->ctx.prev_state] : 'X'; + T = field->ctx.next_state < sizeof(state_to_char) ? + state_to_char[field->ctx.next_state] : 'X'; ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + field->ctx.prev_pid, + field->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, + field->ctx.next_pid, + field->ctx.next_prio, T); if (!ret) return 0; break; case TRACE_SPECIAL: ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->special.arg1, + field->special.arg2, + field->special.arg3); if (!ret) return 0; break; @@ -1637,7 +1644,7 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; } - ret = seq_print_ip_sym(s, entry->stack.caller[i], + ret = seq_print_ip_sym(s, field->stack.caller[i], sym_flags); if (!ret) return 0; @@ -1654,37 +1661,40 @@ static int print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; + struct trace_field *field; int ret; int S, T; entry = iter->ent; + field = &entry->field; ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, entry->t); + field->pid, iter->cpu, field->t); if (!ret) return 0; switch (entry->type) { case TRACE_FN: ret = trace_seq_printf(s, "%x %x\n", - entry->fn.ip, entry->fn.parent_ip); + field->fn.ip, + field->fn.parent_ip); if (!ret) return 0; break; case TRACE_CTX: case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + S = field->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[field->ctx.prev_state] : 'X'; + T = field->ctx.next_state < sizeof(state_to_char) ? + state_to_char[field->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + field->ctx.prev_pid, + field->ctx.prev_prio, S, - entry->ctx.next_pid, - entry->ctx.next_prio, + field->ctx.next_pid, + field->ctx.next_prio, T); if (!ret) return 0; @@ -1692,9 +1702,9 @@ static int print_raw_fmt(struct trace_iterator *iter) case TRACE_SPECIAL: case TRACE_STACK: ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->special.arg1, + field->special.arg2, + field->special.arg3); if (!ret) return 0; break; @@ -1719,40 +1729,41 @@ static int print_hex_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; + struct trace_field *field; int S, T; entry = iter->ent; + field = &entry->field; - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, field->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, entry->t); + SEQ_PUT_HEX_FIELD_RET(s, field->t); switch (entry->type) { case TRACE_FN: - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, field->fn.ip); + SEQ_PUT_HEX_FIELD_RET(s, field->fn.parent_ip); break; case TRACE_CTX: case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + S = field->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[field->ctx.prev_state] : 'X'; + T = field->ctx.next_state < sizeof(state_to_char) ? + state_to_char[field->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, field->ctx.prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->ctx.prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_prio); SEQ_PUT_HEX_FIELD_RET(s, T); break; case TRACE_SPECIAL: case TRACE_STACK: - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); + SEQ_PUT_HEX_FIELD_RET(s, field->special.arg1); + SEQ_PUT_HEX_FIELD_RET(s, field->special.arg2); + SEQ_PUT_HEX_FIELD_RET(s, field->special.arg3); break; } SEQ_PUT_FIELD_RET(s, newline); @@ -1764,31 +1775,33 @@ static int print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; + struct trace_field *field; entry = iter->ent; + field = &entry->field; - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); - SEQ_PUT_FIELD_RET(s, entry->t); + SEQ_PUT_FIELD_RET(s, field->pid); + SEQ_PUT_FIELD_RET(s, field->cpu); + SEQ_PUT_FIELD_RET(s, field->t); switch (entry->type) { case TRACE_FN: - SEQ_PUT_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_FIELD_RET(s, field->fn.ip); + SEQ_PUT_FIELD_RET(s, field->fn.parent_ip); break; case TRACE_CTX: - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); + SEQ_PUT_FIELD_RET(s, field->ctx.prev_pid); + SEQ_PUT_FIELD_RET(s, field->ctx.prev_prio); + SEQ_PUT_FIELD_RET(s, field->ctx.prev_state); + SEQ_PUT_FIELD_RET(s, field->ctx.next_pid); + SEQ_PUT_FIELD_RET(s, field->ctx.next_prio); + SEQ_PUT_FIELD_RET(s, field->ctx.next_state); break; case TRACE_SPECIAL: case TRACE_STACK: - SEQ_PUT_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_FIELD_RET(s, entry->special.arg3); + SEQ_PUT_FIELD_RET(s, field->special.arg1); + SEQ_PUT_FIELD_RET(s, field->special.arg2); + SEQ_PUT_FIELD_RET(s, field->special.arg3); break; } return 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f69f86788c2b..6ddd6a6556cf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -61,13 +61,12 @@ struct stack_entry { }; /* - * The trace entry - the most basic unit of tracing. This is what + * The trace field - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: * * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ -struct trace_entry { - char type; +struct trace_field { char cpu; char flags; char preempt_count; @@ -83,6 +82,18 @@ struct trace_entry { }; }; +struct trace_field_cont { + char buf[sizeof(struct trace_field)]; +}; + +struct trace_entry { + char type; + union { + struct trace_field field; + struct trace_field_cont cont; + }; +}; + #define TRACE_ENTRY_SIZE sizeof(struct trace_entry) /* diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b13dc19dcbb4..9b7a936f4b1f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -174,14 +174,14 @@ print_out: static int mmio_print_rw(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_rw *rw = &entry->mmiorw; + struct mmiotrace_rw *rw = &entry->field.mmiorw; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); + unsigned long long t = ns2usecs(entry->field.t); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; - switch (entry->mmiorw.opcode) { + switch (entry->field.mmiorw.opcode) { case MMIO_READ: ret = trace_seq_printf(s, "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", @@ -216,14 +216,14 @@ static int mmio_print_rw(struct trace_iterator *iter) static int mmio_print_map(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_map *m = &entry->mmiomap; + struct mmiotrace_map *m = &entry->field.mmiomap; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); + unsigned long long t = ns2usecs(entry->field.t); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; - switch (entry->mmiorw.opcode) { + switch (entry->field.mmiorw.opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", -- cgit v1.2.1 From dd0e545f061f90099a3dcc13aa77e29c6295cf23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2008 12:26:41 -0400 Subject: ftrace: printk formatting infrastructure This patch adds a feature that can help kernel developers debug their code using ftrace. int ftrace_printk(const char *fmt, ...); This records into the ftrace buffer using printf formatting. The entry size in the buffers are still a fixed length. A new type has been added that allows for more entries to be used for a single recording. The start of the print is still the same as the other entries. It returns the number of characters written to the ftrace buffer. For example: Having a module with the following code: static int __init ftrace_print_test(void) { ftrace_printk("jiffies are %ld\n", jiffies); return 0; } Gives me: insmod-5441 3...1 7569us : ftrace_print_test: jiffies are 4296626666 for the latency_trace file and: insmod-5441 [03] 1959.370498: ftrace_print_test jiffies are 4296626666 for the trace file. Note: Only the infrastructure should go into the kernel. It is to help facilitate debugging for other kernel developers. Calls to ftrace_printk is not intended to be left in the kernel, and should be frowned upon just like scattering printks around in the code. But having this easily at your fingertips helps the debugging go faster and bugs be solved quicker. Maybe later on, we can hook this with markers and have their printf format be sucked into ftrace output. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 273 +++++++++++++++++++++++++++++++++++++----- kernel/trace/trace.h | 11 ++ kernel/trace/trace_selftest.c | 11 +- 3 files changed, 262 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 76dfe6d2466c..a917bea82715 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -197,12 +197,14 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) * NEED_RESCED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler + * CONT - multiple entries hold the trace item */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_NEED_RESCHED = 0x02, TRACE_FLAG_HARDIRQ = 0x04, TRACE_FLAG_SOFTIRQ = 0x08, + TRACE_FLAG_CONT = 0x10, }; /* @@ -1074,6 +1076,7 @@ enum trace_file_type { TRACE_FILE_LAT_FMT = 1, }; +/* Return the current entry. */ static struct trace_entry * trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, struct trace_iterator *iter, int cpu) @@ -1104,8 +1107,58 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, return &array[iter->next_page_idx[cpu]]; } +/* Increment the index counter of an iterator by one */ +static void trace_iterator_increment(struct trace_iterator *iter, int cpu) +{ + iter->idx++; + iter->next_idx[cpu]++; + iter->next_page_idx[cpu]++; + + if (iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[cpu]; + + iter->next_page_idx[cpu] = 0; + iter->next_page[cpu] = + trace_next_list(data, iter->next_page[cpu]); + } +} + static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu) +trace_entry_next(struct trace_array *tr, struct trace_array_cpu *data, + struct trace_iterator *iter, int cpu) +{ + struct list_head *next_page; + struct trace_entry *ent; + int idx, next_idx, next_page_idx; + + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + + if (likely(!ent || ent->type != TRACE_CONT)) + return ent; + + /* save the iterator details */ + idx = iter->idx; + next_idx = iter->next_idx[cpu]; + next_page_idx = iter->next_page_idx[cpu]; + next_page = iter->next_page[cpu]; + + /* find a real entry */ + do { + trace_iterator_increment(iter, cpu); + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + } while (ent && ent->type != TRACE_CONT); + + /* reset the iterator */ + iter->idx = idx; + iter->next_idx[cpu] = next_idx; + iter->next_page_idx[cpu] = next_page_idx; + iter->next_page[cpu] = next_page; + + return ent; +} + +static struct trace_entry * +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, int inc) { struct trace_array *tr = iter->tr; struct trace_entry *ent, *next = NULL; @@ -1115,7 +1168,23 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) for_each_tracing_cpu(cpu) { if (!head_page(tr->data[cpu])) continue; + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + + if (ent && ent->type == TRACE_CONT) { + struct trace_array_cpu *data = tr->data[cpu]; + + if (!inc) + ent = trace_entry_next(tr, data, iter, cpu); + else { + while (ent && ent->type == TRACE_CONT) { + trace_iterator_increment(iter, cpu); + ent = trace_entry_idx(tr, tr->data[cpu], + iter, cpu); + } + } + } + /* * Pick the entry with the smallest timestamp: */ @@ -1131,25 +1200,39 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) return next; } -static void trace_iterator_increment(struct trace_iterator *iter) +/* Find the next real entry, without updating the iterator itself */ +static struct trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu) { - iter->idx++; - iter->next_idx[iter->cpu]++; - iter->next_page_idx[iter->cpu]++; + return __find_next_entry(iter, ent_cpu, 0); +} + +/* Find the next real entry, and increment the iterator to the next entry */ +static void *find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; - if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + next = __find_next_entry(iter, &next_cpu, 1); - iter->next_page_idx[iter->cpu] = 0; - iter->next_page[iter->cpu] = - trace_next_list(data, iter->next_page[iter->cpu]); - } + iter->prev_ent = iter->ent; + iter->prev_cpu = iter->cpu; + + iter->ent = next; + iter->cpu = next_cpu; + + if (next) + trace_iterator_increment(iter, iter->cpu); + + return next ? iter : NULL; } static void trace_consume(struct trace_iterator *iter) { struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + struct trace_entry *ent; + again: data->trace_tail_idx++; if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { data->trace_tail = trace_next_page(data, data->trace_tail); @@ -1160,25 +1243,11 @@ static void trace_consume(struct trace_iterator *iter) if (data->trace_head == data->trace_tail && data->trace_head_idx == data->trace_tail_idx) data->trace_idx = 0; -} - -static void *find_next_entry_inc(struct trace_iterator *iter) -{ - struct trace_entry *next; - int next_cpu = -1; - - next = find_next_entry(iter, &next_cpu); - - iter->prev_ent = iter->ent; - iter->prev_cpu = iter->cpu; - - iter->ent = next; - iter->cpu = next_cpu; - - if (next) - trace_iterator_increment(iter); - return next ? iter : NULL; + ent = trace_entry_idx(iter->tr, iter->tr->data[iter->cpu], + iter, iter->cpu); + if (ent && ent->type == TRACE_CONT) + goto again; } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1473,6 +1542,26 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static void +trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) +{ + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[iter->cpu]; + struct trace_entry *ent; + + ent = trace_entry_idx(tr, data, iter, iter->cpu); + if (!ent || ent->type != TRACE_CONT) { + trace_seq_putc(s, '\n'); + return; + } + + do { + trace_seq_printf(s, "%s", ent->cont.buf); + trace_iterator_increment(iter, iter->cpu); + ent = trace_entry_idx(tr, data, iter, iter->cpu); + } while (ent && ent->type == TRACE_CONT); +} + static int print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { @@ -1491,6 +1580,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) if (!next_entry) next_entry = entry; + + if (entry->type == TRACE_CONT) + return 1; + rel_usecs = ns2usecs(next_entry->field.t - entry->field.t); abs_usecs = ns2usecs(entry->field.t - iter->tr->time_start); @@ -1550,6 +1643,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) } trace_seq_puts(s, "\n"); break; + case TRACE_PRINT: + seq_print_ip_sym(s, field->print.ip, sym_flags); + trace_seq_printf(s, ": %s", field->print.buf); + if (field->flags && TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1571,6 +1670,10 @@ static int print_trace_fmt(struct trace_iterator *iter) int i; entry = iter->ent; + + if (entry->type == TRACE_CONT) + return 1; + field = &entry->field; comm = trace_find_cmdline(iter->ent->field.pid); @@ -1653,6 +1756,12 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_PRINT: + seq_print_ip_sym(s, field->print.ip, sym_flags); + trace_seq_printf(s, ": %s", field->print.buf); + if (field->flags && TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; } return 1; } @@ -1666,6 +1775,10 @@ static int print_raw_fmt(struct trace_iterator *iter) int S, T; entry = iter->ent; + + if (entry->type == TRACE_CONT) + return 1; + field = &entry->field; ret = trace_seq_printf(s, "%d %d %llu ", @@ -1708,6 +1821,12 @@ static int print_raw_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_PRINT: + trace_seq_printf(s, "# %lx %s", + field->print.ip, field->print.buf); + if (field->flags && TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; } return 1; } @@ -1733,6 +1852,10 @@ static int print_hex_fmt(struct trace_iterator *iter) int S, T; entry = iter->ent; + + if (entry->type == TRACE_CONT) + return 1; + field = &entry->field; SEQ_PUT_HEX_FIELD_RET(s, field->pid); @@ -1778,6 +1901,10 @@ static int print_bin_fmt(struct trace_iterator *iter) struct trace_field *field; entry = iter->ent; + + if (entry->type == TRACE_CONT) + return 1; + field = &entry->field; SEQ_PUT_FIELD_RET(s, field->pid); @@ -2943,6 +3070,94 @@ static __init void tracer_init_debugfs(void) #endif } +#define TRACE_BUF_SIZE 1024 +#define TRACE_PRINT_BUF_SIZE \ + (sizeof(struct trace_field) - offsetof(struct trace_field, print.buf)) +#define TRACE_CONT_BUF_SIZE sizeof(struct trace_field) + +/** + * ftrace_printk - printf formatting in the ftrace buffer + * @fmt - the printf format for printing. + * + * Note: __ftrace_printk is an internal function for ftrace_printk and + * the @ip is passed in via the ftrace_printk macro. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please reframe from leaving ftrace_printks scattered around in + * your code. + */ +int __ftrace_printk(unsigned long ip, const char *fmt, ...) +{ + struct trace_array *tr = &global_trace; + static DEFINE_SPINLOCK(trace_buf_lock); + static char trace_buf[TRACE_BUF_SIZE]; + struct trace_array_cpu *data; + struct trace_entry *entry; + unsigned long flags; + long disabled; + va_list ap; + int cpu, len = 0, write, written = 0; + + if (likely(!ftrace_function_enabled)) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (unlikely(disabled != 1 || !ftrace_function_enabled)) + goto out; + + spin_lock(&trace_buf_lock); + va_start(ap, fmt); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, ap); + va_end(ap); + + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; + + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_PRINT; + entry->field.print.ip = ip; + + write = min(len, (int)(TRACE_PRINT_BUF_SIZE-1)); + + memcpy(&entry->field.print.buf, trace_buf, write); + entry->field.print.buf[write] = 0; + written = write; + + if (written != len) + entry->field.flags |= TRACE_FLAG_CONT; + + while (written != len) { + entry = tracing_get_trace_entry(tr, data); + + entry->type = TRACE_CONT; + write = min(len - written, (int)(TRACE_CONT_BUF_SIZE-1)); + memcpy(&entry->cont.buf, trace_buf+written, write); + entry->cont.buf[write] = 0; + written += write; + } + __raw_spin_unlock(&data->lock); + + spin_unlock(&trace_buf_lock); + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return len; +} +EXPORT_SYMBOL_GPL(__ftrace_printk); + static int trace_alloc_page(void) { struct trace_array_cpu *data; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ddd6a6556cf..50b6d7a6f01a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,7 +13,9 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, + TRACE_CONT, TRACE_STACK, + TRACE_PRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -60,6 +62,14 @@ struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; }; +/* + * ftrace_printk entry: + */ +struct print_entry { + unsigned long ip; + char buf[]; +}; + /* * The trace field - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: @@ -77,6 +87,7 @@ struct trace_field { struct ctx_switch_entry ctx; struct special_entry special; struct stack_entry stack; + struct print_entry print; struct mmiotrace_rw mmiorw; struct mmiotrace_map mmiomap; }; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0911b7e073bf..630715bbd572 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,7 +9,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: + case TRACE_CONT: case TRACE_STACK: + case TRACE_PRINT: case TRACE_SPECIAL: return 1; } @@ -120,11 +122,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, int (*func)(void)) { - unsigned long count; - int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + unsigned long count; char *func_name; + int ret; /* The ftrace test PASSED */ printk(KERN_CONT "PASSED\n"); @@ -157,6 +159,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* enable tracing */ tr->ctrl = 1; trace->init(tr); + /* Sleep for a 1/10 of a second */ msleep(100); @@ -212,10 +215,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { - unsigned long count; - int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + unsigned long count; + int ret; /* make sure msleep has been recorded */ msleep(1); -- cgit v1.2.1 From 2f2c99dba2398ef7d9c21f7c793180a50e68b1f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2008 16:45:49 -0400 Subject: ftrace: ftrace_printk doc moved Based on Randy Dunlap's suggestion, the ftrace_printk kernel-doc belongs with the ftrace_printk macro that should be used. Not with the __ftrace_printk internal function. Signed-off-by: Steven Rostedt Acked-by: Randy Dunlap Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a917bea82715..2597e7e49c35 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3075,22 +3075,6 @@ static __init void tracer_init_debugfs(void) (sizeof(struct trace_field) - offsetof(struct trace_field, print.buf)) #define TRACE_CONT_BUF_SIZE sizeof(struct trace_field) -/** - * ftrace_printk - printf formatting in the ftrace buffer - * @fmt - the printf format for printing. - * - * Note: __ftrace_printk is an internal function for ftrace_printk and - * the @ip is passed in via the ftrace_printk macro. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please reframe from leaving ftrace_printks scattered around in - * your code. - */ int __ftrace_printk(unsigned long ip, const char *fmt, ...) { struct trace_array *tr = &global_trace; -- cgit v1.2.1 From 3f5a54e371ca20b119b73704f6c01b71295c1714 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 Jul 2008 22:36:46 -0400 Subject: ftrace: dump out ftrace buffers to console on panic At OLS I had a lot of interest to be able to have the ftrace buffers dumped on panic. Usually one would expect to uses kexec and examine the buffers after a new kernel is loaded. But sometimes the resources do not permit kdump and kexec, so having an option to still see the sequence of events up to the crash is very advantageous. This patch adds the option to have the ftrace buffers dumped to the console in the latency_trace format on a panic. When the option is set, the default entries per CPU buffer are lowered to 16384, since the writing to the serial (if that is the console) may take an awful long time otherwise. [ Changes since -v1: Got alpine to send correctly (as well as spell check working). Removed config option. Moved the static variables into ftrace_dump itself. Gave printk a log level. ] Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 174 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2597e7e49c35..97513c8ecd67 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -103,8 +105,15 @@ int ftrace_function_enabled; * trace_nr_entries is the number of entries that is allocated * for a buffer. Note, the number of entries is always rounded * to ENTRIES_PER_PAGE. + * + * This number is purposely set to a low number of 16384. + * If the dump on oops happens, it will be much appreciated + * to not have to wait for all that output. Anyway this can be + * boot time and run time configurable. */ -static unsigned long trace_nr_entries = 65536UL; +#define TRACE_ENTRIES_DEFAULT 16384UL + +static unsigned long trace_nr_entries = TRACE_ENTRIES_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; @@ -3142,6 +3151,165 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) } EXPORT_SYMBOL_GPL(__ftrace_printk); +static int trace_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +{ + ftrace_dump(); + return NOTIFY_OK; +} + +static struct notifier_block trace_panic_notifier = { + .notifier_call = trace_panic_handler, + .next = NULL, + .priority = 150 /* priority: INT_MAX >= x >= 0 */ +}; + +static int trace_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + switch (val) { + case DIE_OOPS: + ftrace_dump(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block trace_die_notifier = { + .notifier_call = trace_die_handler, + .priority = 200 +}; + +/* + * printk is set to max of 1024, we really don't need it that big. + * Nothing should be printing 1000 characters anyway. + */ +#define TRACE_MAX_PRINT 1000 + +/* + * Define here KERN_TRACE so that we have one place to modify + * it if we decide to change what log level the ftrace dump + * should be at. + */ +#define KERN_TRACE KERN_INFO + +static void +trace_printk_seq(struct trace_seq *s) +{ + /* Probably should print a warning here. */ + if (s->len >= 1000) + s->len = 1000; + + /* should be zero ended, but we are paranoid. */ + s->buffer[s->len] = 0; + + printk(KERN_TRACE "%s", s->buffer); + + trace_seq_reset(s); +} + + +void ftrace_dump(void) +{ + static DEFINE_SPINLOCK(ftrace_dump_lock); + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + struct trace_array_cpu *data; + static cpumask_t mask; + static int dump_ran; + unsigned long flags; + int cnt = 0; + int cpu; + + /* only one dump */ + spin_lock_irqsave(&ftrace_dump_lock, flags); + if (dump_ran) + goto out; + + dump_ran = 1; + + /* No turning back! */ + ftrace_kill_atomic(); + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + + iter.tr = &global_trace; + iter.trace = current_trace; + + /* + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. + */ + + cpus_clear(mask); + + for_each_tracing_cpu(cpu) { + data = iter.tr->data[cpu]; + + if (!head_page(data) || !data->trace_idx) + continue; + + atomic_inc(&data->disabled); + cpu_set(cpu, mask); + } + + for_each_cpu_mask(cpu, mask) { + data = iter.tr->data[cpu]; + __raw_spin_lock(&data->lock); + + if (data->overrun > iter.last_overrun[cpu]) + iter.overrun[cpu] += + data->overrun - iter.last_overrun[cpu]; + iter.last_overrun[cpu] = data->overrun; + } + + while (!trace_empty(&iter)) { + + if (!cnt) + printk(KERN_TRACE "---------------------------------\n"); + + cnt++; + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (find_next_entry_inc(&iter) != NULL) { + print_trace_line(&iter); + trace_consume(&iter); + } + + trace_printk_seq(&iter.seq); + } + + if (!cnt) + printk(KERN_TRACE " (ftrace buffer empty)\n"); + else + printk(KERN_TRACE "---------------------------------\n"); + + for_each_cpu_mask(cpu, mask) { + data = iter.tr->data[cpu]; + __raw_spin_unlock(&data->lock); + } + + for_each_cpu_mask(cpu, mask) { + data = iter.tr->data[cpu]; + atomic_dec(&data->disabled); + } + + + out: + spin_unlock_irqrestore(&ftrace_dump_lock, flags); +} + static int trace_alloc_page(void) { struct trace_array_cpu *data; @@ -3338,6 +3506,11 @@ __init static int tracer_alloc_buffers(void) global_trace.ctrl = tracer_enabled; tracing_disabled = 0; + atomic_notifier_chain_register(&panic_notifier_list, + &trace_panic_notifier); + + register_die_notifier(&trace_die_notifier); + return 0; free_buffers: -- cgit v1.2.1 From 98a983aad2e5b3dc83a8a761675445cdd8f3e6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Fri, 15 Aug 2008 21:08:22 +0200 Subject: ftrace: fix some mistakes in error messages This patch fixes some mistakes on the tracer in warning messages when debugfs fails to create tracing files. Signed-off-by: Frederic Weisbecker Cc: srostedt@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 97513c8ecd67..896e59f772c9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3030,12 +3030,12 @@ static __init void tracer_init_debugfs(void) entry = debugfs_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + pr_warning("Could not create debugfs 'available_tracers' entry\n"); entry = debugfs_create_file("current_tracer", 0444, d_tracer, &global_trace, &set_tracer_fops); if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + pr_warning("Could not create debugfs 'current_tracer' entry\n"); entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, @@ -3048,7 +3048,7 @@ static __init void tracer_init_debugfs(void) &tracing_thresh, &tracing_max_lat_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'tracing_thresh' entry\n"); entry = debugfs_create_file("README", 0644, d_tracer, NULL, &tracing_readme_fops); if (!entry) @@ -3058,13 +3058,13 @@ static __init void tracer_init_debugfs(void) NULL, &tracing_pipe_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'trace_pipe' entry\n"); entry = debugfs_create_file("trace_entries", 0644, d_tracer, &global_trace, &tracing_entries_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'trace_entries' entry\n"); #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, -- cgit v1.2.1 From 00fd61aee10533e003f2f00ab7163207660a4051 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 15 Aug 2008 21:40:04 -0400 Subject: ftrace: do not init module on ftrace disabled If one of the self tests of ftrace has disabled the function tracer, do not run the code to convert the mcount calls in modules. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eadd0eaea9b6..11d94f2dc485 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -307,7 +307,7 @@ void ftrace_release(void *start, unsigned long size) unsigned long e = s + size; int i; - if (!start) + if (ftrace_disabled || !start) return; /* No interrupt should call this */ @@ -1567,7 +1567,7 @@ static int ftrace_convert_nops(unsigned long *start, void ftrace_init_module(unsigned long *start, unsigned long *end) { - if (start == end) + if (ftrace_disabled || start == end) return; ftrace_convert_nops(start, end); } -- cgit v1.2.1 From 99ecdc43bc17faf5fa571db8569df171ecd0e5b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 15 Aug 2008 21:40:05 -0400 Subject: ftrace: add necessary locking for ftrace records The new design of pre-recorded mcounts and updating the code outside of kstop_machine has changed the way the records themselves are protected. This patch uses the ftrace_lock to protect the records. Note, the lock still does not need to be taken within calls that are only called via kstop_machine, since the that code can not run while the spin lock is held. Also removed the hash_lock needed for the daemon when MCOUNT_RECORD is configured. Also did a slight cleanup of an unused variable. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11d94f2dc485..43665add9805 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -81,7 +81,7 @@ void clear_ftrace_function(void) static int __register_ftrace_function(struct ftrace_ops *ops) { - /* Should never be called by interrupts */ + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); ops->next = ftrace_list; @@ -115,6 +115,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) struct ftrace_ops **p; int ret = 0; + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); /* @@ -153,6 +154,21 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +/* + * The hash lock is only needed when the recording of the mcount + * callers are dynamic. That is, by the caller themselves and + * not recorded via the compilation. + */ +static DEFINE_SPINLOCK(ftrace_hash_lock); +#define ftrace_hash_lock(flags) spin_lock_irqsave(ftrace_hash_lock, flags) +#define ftrace_hash_unlock(flags) spin_lock_irqsave(ftrace_hash_lock, flags) +#else +/* This is protected via the ftrace_lock with MCOUNT_RECORD. */ +#define ftrace_hash_lock(flags) do { (void)flags; } while (0) +#define ftrace_hash_unlock(flags) do { } while(0) +#endif + static struct task_struct *ftraced_task; enum { @@ -171,7 +187,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); -static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); static DEFINE_MUTEX(ftrace_regex_lock); @@ -310,7 +325,7 @@ void ftrace_release(void *start, unsigned long size) if (ftrace_disabled || !start) return; - /* No interrupt should call this */ + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); for (pg = ftrace_pages_start; pg; pg = pg->next) { @@ -362,7 +377,6 @@ ftrace_record_ip(unsigned long ip) unsigned long flags; unsigned long key; int resched; - int atomic; int cpu; if (!ftrace_enabled || ftrace_disabled) @@ -392,9 +406,7 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out; - atomic = irqs_disabled(); - - spin_lock_irqsave(&ftrace_shutdown_lock, flags); + ftrace_hash_lock(flags); /* This ip may have hit the hash before the lock */ if (ftrace_ip_in_hash(ip, key)) @@ -411,7 +423,7 @@ ftrace_record_ip(unsigned long ip) ftraced_trigger = 1; out_unlock: - spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + ftrace_hash_unlock(flags); out: per_cpu(ftrace_shutdown_disable_cpu, cpu)--; @@ -887,6 +899,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -910,6 +924,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } + spin_unlock(&ftrace_lock); iter->pos = *pos; @@ -1023,8 +1038,8 @@ static void ftrace_filter_reset(int enable) unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; pg = ftrace_pages_start; @@ -1037,7 +1052,7 @@ static void ftrace_filter_reset(int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static int @@ -1149,8 +1164,8 @@ ftrace_match(unsigned char *buff, int len, int enable) } } - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; pg = ftrace_pages_start; @@ -1187,7 +1202,7 @@ ftrace_match(unsigned char *buff, int len, int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static ssize_t @@ -1551,6 +1566,7 @@ static int ftrace_convert_nops(unsigned long *start, p = start; while (p < end) { addr = ftrace_call_adjust(*p++); + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); ftrace_record_ip(addr); spin_unlock(&ftrace_lock); -- cgit v1.2.1 From 2d7da80f7138c4276ef4fa0334be400b805d0fbf Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 25 Aug 2008 13:08:44 +1000 Subject: ftrace: fix build failure After disabling FTRACE_MCOUNT_RECORD via a patch, a dormant build failure surfaced: kernel/trace/ftrace.c: In function 'ftrace_record_ip': kernel/trace/ftrace.c:416: error: incompatible type for argument 1 of '_spin_lock_irqsave' kernel/trace/ftrace.c:433: error: incompatible type for argument 1 of '_spin_lock_irqsave' Introduced by commit 6dad8e07f4c10b17b038e84d29f3ca41c2e55cd0 ("ftrace: add necessary locking for ftrace records"). Signed-off-by: Stephen Rothwell Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 43665add9805..7599abdf6d4d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -161,8 +161,8 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * not recorded via the compilation. */ static DEFINE_SPINLOCK(ftrace_hash_lock); -#define ftrace_hash_lock(flags) spin_lock_irqsave(ftrace_hash_lock, flags) -#define ftrace_hash_unlock(flags) spin_lock_irqsave(ftrace_hash_lock, flags) +#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) +#define ftrace_hash_unlock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) #else /* This is protected via the ftrace_lock with MCOUNT_RECORD. */ #define ftrace_hash_lock(flags) do { (void)flags; } while (0) -- cgit v1.2.1 From ac8825ec6d941b6899331b84c7d6bf027c3bb4f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 Aug 2008 08:12:04 +0200 Subject: ftrace: clean up macro usage enclose the argument in parenthesis. (especially since we cast it, which is a high prio operation) Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7599abdf6d4d..969a83f75a3e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -165,7 +165,7 @@ static DEFINE_SPINLOCK(ftrace_hash_lock); #define ftrace_hash_unlock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) #else /* This is protected via the ftrace_lock with MCOUNT_RECORD. */ -#define ftrace_hash_lock(flags) do { (void)flags; } while (0) +#define ftrace_hash_lock(flags) do { (void)(flags); } while (0) #define ftrace_hash_unlock(flags) do { } while(0) #endif -- cgit v1.2.1 From e5a81b629ea8feb9e7530cfac35cfb41c45facf3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Aug 2008 23:31:01 -0400 Subject: ftrace: add stack tracer This is another tracer using the ftrace infrastructure, that examines at each function call the size of the stack. If the stack use is greater than the previous max it is recorded. You can always see (and set) the max stack size seen. By setting it to zero will start the recording again. The backtrace is also available. For example: # cat /debug/tracing/stack_max_size 1856 # cat /debug/tracing/stack_trace [] stack_trace_call+0x8f/0x101 [] ftrace_call+0x5/0x8 [] clocksource_get_next+0x12/0x48 [] update_wall_time+0x538/0x6d1 [] do_timer+0x23/0xb0 [] tick_do_update_jiffies64+0xd9/0xf1 [] tick_sched_timer+0x4a/0xad [] __run_hrtimer+0x3e/0x75 [] hrtimer_interrupt+0xf1/0x154 [] smp_apic_timer_interrupt+0x71/0x84 [] apic_timer_interrupt+0x2d/0x34 [] finish_task_switch+0x29/0xa0 [] schedule+0x765/0x7be [] schedule_timeout+0x1b/0x90 [] wait_for_common+0xab/0x101 [] wait_for_completion+0x12/0x14 [] blk_execute_rq+0x84/0x99 [] scsi_execute+0xc2/0x105 [] scsi_execute_req+0x57/0x7f [] sr_test_unit_ready+0x3e/0x97 [] sr_media_change+0x43/0x205 [] media_changed+0x48/0x77 [] cdrom_media_changed+0x31/0x37 [] sr_block_media_changed+0x16/0x18 [] check_disk_change+0x1b/0x63 [] cdrom_open+0x7a1/0x806 [] sr_block_open+0x78/0x8d [] do_open+0x90/0x257 [] blkdev_open+0x2d/0x56 [] __dentry_open+0x14d/0x23c [] nameidata_to_filp+0x24/0x38 [] do_filp_open+0x347/0x626 [] do_sys_open+0x47/0xbc [] sys_open+0x23/0x2b [] sysenter_do_call+0x12/0x26 I've tested this on both x86_64 and i386. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 9 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_stack.c | 254 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 kernel/trace/trace_stack.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 14d9505178ca..2a22e46390d3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -106,6 +106,15 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FTRACE + select FTRACE + select STACKTRACE + help + This tracer records the max stack of the kernel, and displays + it in debugfs/tracing/stack_trace + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 71d17de17288..58ec61c44bd6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c new file mode 100644 index 000000000000..4d1e522e3fe8 --- /dev/null +++ b/kernel/trace/trace_stack.c @@ -0,0 +1,254 @@ +/* + * Copyright (C) 2008 Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES] = + { [0 ... (STACK_TRACE_ENTRIES-1)] = ULONG_MAX }; +static struct stack_trace max_stack_trace = { + .max_entries = STACK_TRACE_ENTRIES, + .entries = stack_dump_trace, +}; + +static unsigned long max_stack_size; +static raw_spinlock_t max_stack_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static int stack_trace_disabled __read_mostly; +static DEFINE_PER_CPU(int, trace_active); + +static inline void check_stack(void) +{ + unsigned long this_size; + unsigned long flags; + + this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = THREAD_SIZE - this_size; + + if (this_size <= max_stack_size) + return; + + raw_local_irq_save(flags); + __raw_spin_lock(&max_stack_lock); + + /* a race could have already updated it */ + if (this_size <= max_stack_size) + goto out; + + max_stack_size = this_size; + + max_stack_trace.nr_entries = 0; + max_stack_trace.skip = 1; + + save_stack_trace(&max_stack_trace); + + out: + __raw_spin_unlock(&max_stack_lock); + raw_local_irq_restore(flags); +} + +static void +stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + int cpu, resched; + + if (unlikely(!ftrace_enabled || stack_trace_disabled)) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + /* no atomic needed, we only modify this variable by this cpu */ + if (per_cpu(trace_active, cpu)++ != 0) + goto out; + + check_stack(); + + out: + per_cpu(trace_active, cpu)--; + /* prevent recursion in schedule */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = stack_trace_call, +}; + +static ssize_t +stack_max_size_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, count, ppos, buf, r); +} + +static ssize_t +stack_max_size_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + long *ptr = filp->private_data; + unsigned long val, flags; + char buf[64]; + int ret; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + raw_local_irq_save(flags); + __raw_spin_lock(&max_stack_lock); + *ptr = val; + __raw_spin_unlock(&max_stack_lock); + raw_local_irq_restore(flags); + + return count; +} + +static struct file_operations stack_max_size_fops = { + .open = tracing_open_generic, + .read = stack_max_size_read, + .write = stack_max_size_write, +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned long *t = m->private; + + (*pos)++; + + if (!t || *t == ULONG_MAX) + return NULL; + + t++; + m->private = t; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + unsigned long *t = m->private; + loff_t l = 0; + + local_irq_disable(); + __raw_spin_lock(&max_stack_lock); + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + __raw_spin_unlock(&max_stack_lock); + local_irq_enable(); +} + +static int trace_lookup_stack(struct seq_file *m, unsigned long addr) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, addr); + + return seq_printf(m, "[<%p>] %s\n", (void*)addr, str); +#else + return seq_printf(m, "%p\n", (void*)addr); +#endif +} + +static int t_show(struct seq_file *m, void *v) +{ + unsigned long *t = v; + + if (!t || *t == ULONG_MAX) + return 0; + + trace_lookup_stack(m, *t); + + return 0; +} + +static struct seq_operations stack_trace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int stack_trace_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &stack_trace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = stack_dump_trace; + } + + return ret; +} + +static struct file_operations stack_trace_fops = { + .open = stack_trace_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +static __init int stack_trace_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); + if (!entry) + pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + + entry = debugfs_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); + if (!entry) + pr_warning("Could not create debugfs 'stack_trace' entry\n"); + + register_ftrace_function(&trace_ops); + + return 0; +} + +device_initcall(stack_trace_init); -- cgit v1.2.1 From 3b47bfc1fca01cccad9cce2d18b79b18ef2e4131 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Aug 2008 23:24:15 -0400 Subject: ftrace: remove direct reference to mcount in trace code The mcount record method of ftrace scans objdump for references to mcount. Using mcount as the reference to test if the calls to mcount being replaced are indeed calls to mcount, this use of mcount was also caught as a location to change. Using a variable that points to the mcount address moves this reference into the data section that is not scanned, and we do not use a false location to try and modify. The warn on code was what was used to detect this bug. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 969a83f75a3e..b69966f0f144 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -36,6 +36,14 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* + * Since MCOUNT_ADDR may point to mcount itself, we do not want + * to get it confused by reading a reference in the code as we + * are parsing on objcopy output of text. Use a variable for + * it instead. + */ +static unsigned long mcount_addr = MCOUNT_ADDR; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -577,7 +585,7 @@ ftrace_code_disable(struct dyn_ftrace *rec) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, MCOUNT_ADDR); + call = ftrace_call_replace(ip, mcount_addr); failed = ftrace_modify_code(ip, call, nop); if (failed) { -- cgit v1.2.1 From 1b6cced6ec9677fa65471e890dfdcb4bf5387643 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Aug 2008 16:51:43 -0400 Subject: ftrace: stack trace add indexes This patch adds indexes into the stack that the functions in the stack dump were found at. As an added bonus, I also added a diff to show which function is the most notorious consumer of the stack. The output now looks like this: # cat /debug/tracing/stack_trace Depth Size Location (48 entries) ----- ---- -------- 0) 2476 212 blk_recount_segments+0x39/0x59 1) 2264 12 bio_phys_segments+0x16/0x1d 2) 2252 20 blk_rq_bio_prep+0x23/0xaf 3) 2232 12 init_request_from_bio+0x74/0x77 4) 2220 56 __make_request+0x294/0x331 5) 2164 136 generic_make_request+0x34f/0x37d 6) 2028 56 submit_bio+0xe7/0xef 7) 1972 28 submit_bh+0xd1/0xf0 8) 1944 112 block_read_full_page+0x299/0x2a9 9) 1832 8 blkdev_readpage+0x14/0x16 10) 1824 28 read_cache_page_async+0x7e/0x109 11) 1796 16 read_cache_page+0x11/0x49 12) 1780 32 read_dev_sector+0x3c/0x72 13) 1748 48 read_lba+0x4d/0xaa 14) 1700 168 efi_partition+0x85/0x61b 15) 1532 72 rescan_partitions+0x10e/0x266 16) 1460 40 do_open+0x1c7/0x24e 17) 1420 292 __blkdev_get+0x79/0x84 18) 1128 12 blkdev_get+0x12/0x14 19) 1116 20 register_disk+0xd1/0x11e 20) 1096 28 add_disk+0x34/0x90 21) 1068 52 sd_probe+0x2b1/0x366 22) 1016 20 driver_probe_device+0xa5/0x120 23) 996 8 __device_attach+0xd/0xf 24) 988 32 bus_for_each_drv+0x3e/0x68 25) 956 24 device_attach+0x56/0x6c 26) 932 16 bus_attach_device+0x26/0x4d 27) 916 64 device_add+0x380/0x4b4 28) 852 28 scsi_sysfs_add_sdev+0xa1/0x1c9 29) 824 160 scsi_probe_and_add_lun+0x919/0xa2a 30) 664 36 __scsi_add_device+0x88/0xae 31) 628 44 ata_scsi_scan_host+0x9e/0x21c 32) 584 28 ata_host_register+0x1cb/0x1db 33) 556 24 ata_host_activate+0x98/0xb5 34) 532 192 ahci_init_one+0x9bd/0x9e9 35) 340 20 pci_device_probe+0x3e/0x5e 36) 320 20 driver_probe_device+0xa5/0x120 37) 300 20 __driver_attach+0x3f/0x5e 38) 280 36 bus_for_each_dev+0x40/0x62 39) 244 12 driver_attach+0x19/0x1b 40) 232 28 bus_add_driver+0x9c/0x1af 41) 204 28 driver_register+0x76/0xd2 42) 176 20 __pci_register_driver+0x44/0x71 43) 156 8 ahci_init+0x14/0x16 44) 148 100 _stext+0x42/0x122 45) 48 20 kernel_init+0x175/0x1dc 46) 28 28 kernel_thread_helper+0x7/0x10 The first column is simply an index starting from the inner most function and counting down to the outer most. The next column is the location that the function was found on the stack. The next column is the size of the stack for that function. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 90 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4d1e522e3fe8..74c5d9a3afae 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -16,8 +16,10 @@ #define STACK_TRACE_ENTRIES 500 -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES] = - { [0 ... (STACK_TRACE_ENTRIES-1)] = ULONG_MAX }; +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = + { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; + static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES, .entries = stack_dump_trace, @@ -32,8 +34,9 @@ static DEFINE_PER_CPU(int, trace_active); static inline void check_stack(void) { - unsigned long this_size; - unsigned long flags; + unsigned long this_size, flags; + unsigned long *p, *top, *start; + int i; this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -51,10 +54,42 @@ static inline void check_stack(void) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 1; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); + /* + * Now find where in the stack these are. + */ + i = 0; + start = &this_size; + top = (unsigned long *) + (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); + + /* + * Loop through all the entries. One of the entries may + * for some reason be missed on the stack, so we may + * have to account for them. If they are all there, this + * loop will only happen once. This code only takes place + * on a new max, so it is far from a fast path. + */ + while (i < max_stack_trace.nr_entries) { + + stack_dump_index[i] = this_size; + p = start; + + for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (*p == stack_dump_trace[i]) { + this_size = stack_dump_index[i++] = + (top - p) * sizeof(unsigned long); + /* Start the search from here */ + start = p + 1; + } + } + + i++; + } + out: __raw_spin_unlock(&max_stack_lock); raw_local_irq_restore(flags); @@ -145,22 +180,24 @@ static struct file_operations stack_max_size_fops = { static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - unsigned long *t = m->private; + long i = (long)m->private; (*pos)++; - if (!t || *t == ULONG_MAX) + i++; + + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) return NULL; - t++; - m->private = t; + m->private = (void *)i; - return t; + return &m->private; } static void *t_start(struct seq_file *m, loff_t *pos) { - unsigned long *t = m->private; + void *t = &m->private; loff_t l = 0; local_irq_disable(); @@ -178,14 +215,15 @@ static void t_stop(struct seq_file *m, void *p) local_irq_enable(); } -static int trace_lookup_stack(struct seq_file *m, unsigned long addr) +static int trace_lookup_stack(struct seq_file *m, long i) { + unsigned long addr = stack_dump_trace[i]; #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; sprint_symbol(str, addr); - return seq_printf(m, "[<%p>] %s\n", (void*)addr, str); + return seq_printf(m, "%s\n", str); #else return seq_printf(m, "%p\n", (void*)addr); #endif @@ -193,12 +231,30 @@ static int trace_lookup_stack(struct seq_file *m, unsigned long addr) static int t_show(struct seq_file *m, void *v) { - unsigned long *t = v; + long i = *(long *)v; + int size; + + if (i < 0) { + seq_printf(m, " Depth Size Location" + " (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries); + return 0; + } - if (!t || *t == ULONG_MAX) + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) return 0; - trace_lookup_stack(m, *t); + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + + trace_lookup_stack(m, i); return 0; } @@ -217,7 +273,7 @@ static int stack_trace_open(struct inode *inode, struct file *file) ret = seq_open(file, &stack_trace_seq_ops); if (!ret) { struct seq_file *m = file->private_data; - m->private = stack_dump_trace; + m->private = (void *)-1; } return ret; -- cgit v1.2.1 From 2ff01c6a17391225a18256d510b6e5b4aba40aa1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Sep 2008 15:04:37 +0200 Subject: stack tracer: depends on DEBUG_KERNEL Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2a22e46390d3..5a9cffb0fafb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -109,6 +109,7 @@ config CONTEXT_SWITCH_TRACER config STACK_TRACER bool "Trace max stack" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select FTRACE select STACKTRACE help -- cgit v1.2.1 From a6168353d1c0b24b7512e14d1c3e47ed69881a23 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 20 Aug 2008 16:36:11 -0700 Subject: ftrace: make output nicely spaced for up to 999 cpus Currently some of the ftrace output goes skewiff if you have more than 9 cpus, and some if you have more than 99. Twiddle with the headers and format strings to make up to 999 cpus display without causing spacing problems. Signed-off-by: Michael Ellerman Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 896e59f772c9..1801900908e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1409,21 +1409,21 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) { - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); } @@ -1508,7 +1508,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) comm = trace_find_cmdline(field->pid); trace_seq_printf(s, "%8.8s-%-5d ", comm, field->pid); - trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%3d", cpu); trace_seq_printf(s, "%c%c", (field->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', ((field->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); @@ -1598,7 +1598,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) if (verbose) { comm = trace_find_cmdline(field->pid); - trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, field->pid, cpu, field->flags, @@ -1694,7 +1694,7 @@ static int print_trace_fmt(struct trace_iterator *iter) ret = trace_seq_printf(s, "%16s-%-5d ", comm, field->pid); if (!ret) return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + ret = trace_seq_printf(s, "[%03d] ", iter->cpu); if (!ret) return 0; ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); -- cgit v1.2.1 From 652567aa2000f1d4a1fd434382a30d8dd4a7c980 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Sep 2008 17:42:50 -0400 Subject: ftrace: binary and not logical for continue test Peter Zijlstra provided me with a nice brown paper bag while letting me know that I was doing a logical AND and not a binary one, making a condition true more often than it should be. Luckily, a false true is handled by the calling function and no harm is done. But this needs to be fixed regardless. Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1801900908e1..9639e45f0860 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1655,7 +1655,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) case TRACE_PRINT: seq_print_ip_sym(s, field->print.ip, sym_flags); trace_seq_printf(s, ": %s", field->print.buf); - if (field->flags && TRACE_FLAG_CONT) + if (field->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); break; default: @@ -1768,7 +1768,7 @@ static int print_trace_fmt(struct trace_iterator *iter) case TRACE_PRINT: seq_print_ip_sym(s, field->print.ip, sym_flags); trace_seq_printf(s, ": %s", field->print.buf); - if (field->flags && TRACE_FLAG_CONT) + if (field->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); break; } @@ -1833,7 +1833,7 @@ static int print_raw_fmt(struct trace_iterator *iter) case TRACE_PRINT: trace_seq_printf(s, "# %lx %s", field->print.ip, field->print.buf); - if (field->flags && TRACE_FLAG_CONT) + if (field->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); break; } -- cgit v1.2.1 From 5a90f577e5369a84b720ead42e621fcb1b8a8b21 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Sep 2008 17:42:51 -0400 Subject: ftrace: print continue index fix An item in the trace buffer that is bigger than one entry may be split up using the TRACE_CONT entry. This makes it a virtual single entry. The current code increments the iterator index even while traversing TRACE_CONT entries, making it look like the iterator is further than it actually is. This patch adds code to not increment the iterator index while skipping over TRACE_CONT entries. Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9639e45f0860..d24101cfc425 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1117,9 +1117,8 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, } /* Increment the index counter of an iterator by one */ -static void trace_iterator_increment(struct trace_iterator *iter, int cpu) +static void __trace_iterator_increment(struct trace_iterator *iter, int cpu) { - iter->idx++; iter->next_idx[cpu]++; iter->next_page_idx[cpu]++; @@ -1132,6 +1131,12 @@ static void trace_iterator_increment(struct trace_iterator *iter, int cpu) } } +static void trace_iterator_increment(struct trace_iterator *iter, int cpu) +{ + iter->idx++; + __trace_iterator_increment(iter, cpu); +} + static struct trace_entry * trace_entry_next(struct trace_array *tr, struct trace_array_cpu *data, struct trace_iterator *iter, int cpu) @@ -1153,7 +1158,7 @@ trace_entry_next(struct trace_array *tr, struct trace_array_cpu *data, /* find a real entry */ do { - trace_iterator_increment(iter, cpu); + __trace_iterator_increment(iter, cpu); ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); } while (ent && ent->type != TRACE_CONT); @@ -1187,7 +1192,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, int inc) ent = trace_entry_next(tr, data, iter, cpu); else { while (ent && ent->type == TRACE_CONT) { - trace_iterator_increment(iter, cpu); + __trace_iterator_increment(iter, cpu); ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); } @@ -1566,7 +1571,7 @@ trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) do { trace_seq_printf(s, "%s", ent->cont.buf); - trace_iterator_increment(iter, iter->cpu); + __trace_iterator_increment(iter, iter->cpu); ent = trace_entry_idx(tr, data, iter, iter->cpu); } while (ent && ent->type == TRACE_CONT); } -- cgit v1.2.1 From f09ce573f57ddc35c67b39e51f34545877b30031 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Sep 2008 10:24:14 +0200 Subject: ftrace: make ftrace_printk usable with the other tracers Currently ftrace_printk only works with the ftrace tracer, switch it to an iter_ctrl setting so we can make us of them with other tracers too. [rostedt@redhat.com: tweak to the disable condition] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 8 +++++--- kernel/trace/trace.h | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d24101cfc425..8a00d6c5c0f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,6 +235,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", + "ftrace_printk", NULL }; @@ -3091,9 +3092,10 @@ static __init void tracer_init_debugfs(void) int __ftrace_printk(unsigned long ip, const char *fmt, ...) { - struct trace_array *tr = &global_trace; static DEFINE_SPINLOCK(trace_buf_lock); static char trace_buf[TRACE_BUF_SIZE]; + + struct trace_array *tr = &global_trace; struct trace_array_cpu *data; struct trace_entry *entry; unsigned long flags; @@ -3101,7 +3103,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) va_list ap; int cpu, len = 0, write, written = 0; - if (likely(!ftrace_function_enabled)) + if (!(trace_flags & TRACE_ITER_PRINTK) || !tr->ctrl || tracing_disabled) return 0; local_irq_save(flags); @@ -3109,7 +3111,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (unlikely(disabled != 1 || !ftrace_function_enabled)) + if (unlikely(disabled != 1)) goto out; spin_lock(&trace_buf_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 50b6d7a6f01a..5f54c875c8be 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -356,6 +356,7 @@ enum trace_iterator_flags { TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, + TRACE_ITER_PRINTK = 0x400, }; #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.1 From 80b5e940050c273ba277acbf3a9fbc1d4441e681 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Sep 2008 10:24:16 +0200 Subject: ftrace: sched_switch: show the wakee's cpu While profiling the smp behaviour of the scheduler it was needed to know to which cpu a task got woken. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 12 +++++++++--- kernel/trace/trace.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a00d6c5c0f5..7e6cb4fe62f2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -977,6 +977,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->field.ctx.next_pid = next->pid; entry->field.ctx.next_prio = next->prio; entry->field.ctx.next_state = next->state; + entry->field.ctx.next_cpu = task_cpu(next); __trace_stack(tr, data, flags, 5); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -1003,6 +1004,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->field.ctx.next_pid = wakee->pid; entry->field.ctx.next_prio = wakee->prio; entry->field.ctx.next_state = wakee->state; + entry->field.ctx.next_cpu = task_cpu(wakee); __trace_stack(tr, data, flags, 6); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -1636,10 +1638,11 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) __ffs(field->ctx.prev_state) + 1 : 0; S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; comm = trace_find_cmdline(field->ctx.next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", + trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->ctx.prev_pid, field->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", + field->ctx.next_cpu, field->ctx.next_pid, field->ctx.next_prio, T, comm); @@ -1736,11 +1739,12 @@ static int print_trace_fmt(struct trace_iterator *iter) state_to_char[field->ctx.prev_state] : 'X'; T = field->ctx.next_state < sizeof(state_to_char) ? state_to_char[field->ctx.next_state] : 'X'; - ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", + ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->ctx.prev_pid, field->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", + field->ctx.next_cpu, field->ctx.next_pid, field->ctx.next_prio, T); @@ -1817,10 +1821,11 @@ static int print_raw_fmt(struct trace_iterator *iter) state_to_char[field->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", + ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->ctx.prev_pid, field->ctx.prev_prio, S, + field->ctx.next_cpu, field->ctx.next_pid, field->ctx.next_prio, T); @@ -1893,6 +1898,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, field->ctx.prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->ctx.prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_cpu); SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_pid); SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_prio); SEQ_PUT_HEX_FIELD_RET(s, T); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5f54c875c8be..77c265f6a779 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -41,6 +41,7 @@ struct ctx_switch_entry { unsigned int next_pid; unsigned char next_prio; unsigned char next_state; + unsigned int next_cpu; }; /* -- cgit v1.2.1 From d3ee6d992821f471193a7ee7a00af9ebb4bf5d01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Sep 2008 14:04:51 +0200 Subject: ftrace: make it depend on DEBUG_KERNEL make most of the tracers depend on DEBUG_KERNEL - that's their intended purpose. (most distributions have DEBUG_KERNEL enabled anyway so this is not a practical limitation - but it simplifies the tracing menu in the normal case) Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5a9cffb0fafb..16e5bb5daaa5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -22,6 +22,7 @@ config TRACING config FTRACE bool "Kernel Function Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select FRAME_POINTER select TRACING select CONTEXT_SWITCH_TRACER @@ -40,6 +41,7 @@ config IRQSOFF_TRACER depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -63,6 +65,7 @@ config PREEMPT_TRACER depends on GENERIC_TIME depends on PREEMPT depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -90,6 +93,7 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -100,6 +104,7 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select MARKERS help @@ -120,6 +125,7 @@ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE depends on HAVE_DYNAMIC_FTRACE + depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -146,6 +152,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" depends on TRACING + depends on DEBUG_KERNEL select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup -- cgit v1.2.1 From 644f991d4b920ab1f5043509651479420b293490 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 6 Sep 2008 01:06:04 -0400 Subject: ftrace: fix unlocking of hash This must be brown paper bag week for Steven Rostedt! While working on ftrace for PPC, I discovered that the hash locking done when CONFIG_FTRACE_MCOUNT_RECORD is not set, is totally incorrect. With a cut and paste error, I had the hash lock macro to lock for both hash_lock _and_ hash_unlock! This bug did not affect x86 since this bug was introduced when CONFIG_FTRACE_MCOUNT_RECORD was added to x86. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b69966f0f144..c9e09d86e1d8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -170,7 +170,8 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) */ static DEFINE_SPINLOCK(ftrace_hash_lock); #define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) -#define ftrace_hash_unlock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) +#define ftrace_hash_unlock(flags) \ + spin_unlock_irqrestore(&ftrace_hash_lock, flags) #else /* This is protected via the ftrace_lock with MCOUNT_RECORD. */ #define ftrace_hash_lock(flags) do { (void)(flags); } while (0) -- cgit v1.2.1 From 45dcd8b8a8ca855591e3ac882d3a7fc255d09d43 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 21:56:41 +0300 Subject: ftrace: move mmiotrace functions out of trace.c Moves the mmiotrace specific functions from trace.c to trace_mmiotrace.c. Functions trace_wake_up(), tracing_get_trace_entry(), and tracing_generic_entry_update() are therefore made available outside trace.c. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 46 ++---------------------------------------- kernel/trace/trace.h | 15 ++++++-------- kernel/trace/trace_mmiotrace.c | 42 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7e6cb4fe62f2..d372bc535963 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -785,7 +785,7 @@ trace_next_page(struct trace_array_cpu *data, void *addr) return page_address(page); } -static inline struct trace_entry * +struct trace_entry * tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; @@ -821,7 +821,7 @@ tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) return entry; } -static inline void +void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { struct task_struct *tsk = current; @@ -865,48 +865,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -#ifdef CONFIG_MMIOTRACE -void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_rw *rw) -{ - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_RW; - entry->field.mmiorw = *rw; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - -void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_map *map) -{ - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_MAP; - entry->field.mmiomap = *map; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} -#endif - void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 77c265f6a779..9d39aa00a9c6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -213,11 +213,17 @@ struct trace_iterator { long idx; }; +void trace_wake_up(void); void tracing_reset(struct trace_array_cpu *data); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); +struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags); + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, @@ -291,15 +297,6 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif -#ifdef CONFIG_MMIOTRACE -extern void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_rw *rw); -extern void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_map *map); -#endif - #ifdef CONFIG_FTRACE_STARTUP_TEST #ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 9b7a936f4b1f..ef02747b26d9 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -276,6 +276,27 @@ __init static int init_mmio_trace(void) } device_initcall(init_mmio_trace); +static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->field.mmiorw = *rw; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; @@ -283,6 +304,27 @@ void mmio_trace_rw(struct mmiotrace_rw *rw) __trace_mmiotrace_rw(tr, data, rw); } +static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->field.mmiomap = *map; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; -- cgit v1.2.1 From 801fe40001dfc263848552fb28924b766ed44ea4 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 21:58:24 +0300 Subject: ftrace: add trace_vprintk() trace_vprintk() for easier implementation of tracer specific *_printk functions. Add check check for no_tracer, and implement __ftrace_printk() as a wrapper. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 24 ++++++++++++++++++------ kernel/trace/trace.h | 1 + 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d372bc535963..406de9cf2820 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3054,7 +3054,7 @@ static __init void tracer_init_debugfs(void) (sizeof(struct trace_field) - offsetof(struct trace_field, print.buf)) #define TRACE_CONT_BUF_SIZE sizeof(struct trace_field) -int __ftrace_printk(unsigned long ip, const char *fmt, ...) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { static DEFINE_SPINLOCK(trace_buf_lock); static char trace_buf[TRACE_BUF_SIZE]; @@ -3064,10 +3064,9 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) struct trace_entry *entry; unsigned long flags; long disabled; - va_list ap; int cpu, len = 0, write, written = 0; - if (!(trace_flags & TRACE_ITER_PRINTK) || !tr->ctrl || tracing_disabled) + if (current_trace == &no_tracer || !tr->ctrl || tracing_disabled) return 0; local_irq_save(flags); @@ -3079,9 +3078,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) goto out; spin_lock(&trace_buf_lock); - va_start(ap, fmt); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, ap); - va_end(ap); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); trace_buf[len] = 0; @@ -3120,6 +3117,21 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) return len; } +EXPORT_SYMBOL_GPL(trace_vprintk); + +int __ftrace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, fmt, ap); + va_end(ap); + return ret; +} EXPORT_SYMBOL_GPL(__ftrace_printk); static int trace_panic_handler(struct notifier_block *this, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9d39aa00a9c6..be3b3cf95f4b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -333,6 +333,7 @@ extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); +extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; -- cgit v1.2.1 From 9e57fb35d711331a9b1410c5c56ebeb3733428a0 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:00:34 +0300 Subject: x86 mmiotrace: implement mmiotrace_printk() Offer mmiotrace users a function to inject markers from inside the kernel. This depends on the trace_vprintk() patch. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ef02747b26d9..767d1faf56e5 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -335,3 +335,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map) __trace_mmiotrace_map(tr, data, map); preempt_enable(); } + +int mmio_trace_printk(const char *fmt, va_list args) +{ + return trace_vprintk(0, fmt, args); +} -- cgit v1.2.1 From fc5e27ae4b45a0619701a83f30d9b7fad7ed9400 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:02:27 +0300 Subject: mmiotrace: handle TRACE_PRINT entries Also make trace_seq_print_cont() non-static, and add a newline if the seq buffer can't hold all data. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 31 +++++++++++-------------------- kernel/trace/trace.h | 19 +++++++++++++++++++ kernel/trace/trace_mmiotrace.c | 23 +++++++++++++++++++++++ 3 files changed, 53 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 406de9cf2820..7e7154f77009 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -199,23 +199,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -/* - * trace_flag_type is an enumeration that holds different - * states when a trace occurs. These are: - * IRQS_OFF - interrupts were disabled - * NEED_RESCED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - * CONT - multiple entries hold the trace item - */ -enum trace_flag_type { - TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_NEED_RESCHED = 0x02, - TRACE_FLAG_HARDIRQ = 0x04, - TRACE_FLAG_SOFTIRQ = 0x08, - TRACE_FLAG_CONT = 0x10, -}; - /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. @@ -1517,12 +1500,16 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static void -trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) +/* + * The message is supposed to contain an ending newline. + * If the printing stops prematurely, try to add a newline of our own. + */ +void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[iter->cpu]; struct trace_entry *ent; + bool ok = true; ent = trace_entry_idx(tr, data, iter, iter->cpu); if (!ent || ent->type != TRACE_CONT) { @@ -1531,10 +1518,14 @@ trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) } do { - trace_seq_printf(s, "%s", ent->cont.buf); + if (ok) + ok = (trace_seq_printf(s, "%s", ent->cont.buf) > 0); __trace_iterator_increment(iter, iter->cpu); ent = trace_entry_idx(tr, data, iter, iter->cpu); } while (ent && ent->type == TRACE_CONT); + + if (!ok) + trace_seq_putc(s, '\n'); } static int diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index be3b3cf95f4b..648433d18ccb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -71,6 +71,23 @@ struct print_entry { char buf[]; }; +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + * CONT - multiple entries hold the trace item + */ +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, + TRACE_FLAG_CONT = 0x10, +}; + /* * The trace field - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: @@ -330,6 +347,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern void trace_seq_print_cont(struct trace_seq *s, + struct trace_iterator *iter); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 767d1faf56e5..a108c326f36e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -245,6 +245,27 @@ static int mmio_print_map(struct trace_iterator *iter) return 0; } +static int mmio_print_mark(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + const char *msg = entry->field.print.buf; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->field.t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret; + + /* The trailing newline must be in the message. */ + ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); + if (!ret) + return 0; + + if (entry->field.flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return 1; +} + /* return 0 to abort printing without consuming current entry in pipe mode */ static int mmio_print_line(struct trace_iterator *iter) { @@ -253,6 +274,8 @@ static int mmio_print_line(struct trace_iterator *iter) return mmio_print_rw(iter); case TRACE_MMIO_MAP: return mmio_print_map(iter); + case TRACE_PRINT: + return mmio_print_mark(iter); default: return 1; /* ignore unknown entries */ } -- cgit v1.2.1 From 5bf9a1ee350a10feb94107de32a203d81fbbe706 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:06:42 +0300 Subject: ftrace: inject markers via trace_marker file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow a user to inject a marker (TRACE_PRINT entry) into the trace ring buffer. The related file operations are derived from code by Frédéric Weisbecker . Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 4 +++ 2 files changed, 75 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7e7154f77009..eee1fd964898 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2879,6 +2879,66 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static int tracing_open_mark(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_open_generic(inode, filp); + if (ret) + return ret; + + if (current_trace == &no_tracer) + return -ENODEV; + + return 0; +} + +static int mark_printk(const char *fmt, ...) +{ + int ret; + va_list args; + va_start(args, fmt); + ret = trace_vprintk(0, fmt, args); + va_end(args); + return ret; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char *buf; + char *end; + struct trace_array *tr = &global_trace; + + if (current_trace == &no_tracer || !tr->ctrl || tracing_disabled) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + buf = kmalloc(cnt + 1, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + kfree(buf); + return -EFAULT; + } + + /* Cut from the first nil or newline. */ + buf[cnt] = '\0'; + end = strchr(buf, '\n'); + if (end) + *end = '\0'; + + cnt = mark_printk("%s\n", buf); + kfree(buf); + *fpos += cnt; + + return cnt; +} + static struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -2910,6 +2970,11 @@ static struct file_operations tracing_entries_fops = { .write = tracing_entries_write, }; +static struct file_operations tracing_mark_fops = { + .open = tracing_open_mark, + .write = tracing_mark_write, +}; + #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t @@ -3027,6 +3092,12 @@ static __init void tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_entries' entry\n"); + entry = debugfs_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'trace_marker' entry\n"); + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, @@ -3040,11 +3111,6 @@ static __init void tracer_init_debugfs(void) #endif } -#define TRACE_BUF_SIZE 1024 -#define TRACE_PRINT_BUF_SIZE \ - (sizeof(struct trace_field) - offsetof(struct trace_field, print.buf)) -#define TRACE_CONT_BUF_SIZE sizeof(struct trace_field) - int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { static DEFINE_SPINLOCK(trace_buf_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 648433d18ccb..42f65d0097f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -124,6 +124,10 @@ struct trace_entry { }; #define TRACE_ENTRY_SIZE sizeof(struct trace_entry) +#define TRACE_BUF_SIZE 1024 +#define TRACE_PRINT_BUF_SIZE \ + (sizeof(struct trace_field) - offsetof(struct trace_field, print.buf)) +#define TRACE_CONT_BUF_SIZE sizeof(struct trace_field) /* * The CPU trace array - it consists of thousands of trace entries -- cgit v1.2.1 From fb1b6d8b5154c692172a424e45fbd0573295cb93 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Fri, 19 Sep 2008 03:06:43 -0700 Subject: ftrace: add nop tracer A no-op tracer which can serve two purposes: 1. A template for development of a new tracer. 2. A convenient way to see ftrace_printk() calls without an irrelevant trace making the output messy. [ mingo@elte.hu: resolved conflicts ] Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 10 +++++++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 4 +++ kernel/trace/trace_nop.c | 65 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_selftest.c | 9 ++++++ 5 files changed, 89 insertions(+) create mode 100644 kernel/trace/trace_nop.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 16e5bb5daaa5..d7b2de744631 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -101,6 +101,16 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config NOP_TRACER + bool "NOP Tracer" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select TRACING + help + This tracer does nothing. The primary purpose for it is to + politely print the output of ftrace_printk() calls without + the overhead of an irrelevant trace taking place. + config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on HAVE_FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 58ec61c44bd6..73ba13f5a461 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 42f65d0097f0..447d4b9b6391 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -339,6 +339,10 @@ extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); #endif +#ifdef CONFIG_NOP_TRACER +extern int trace_selftest_startup_nop(struct tracer *trace, + struct trace_array *tr); +#endif #ifdef CONFIG_CONTEXT_SWITCH_TRACER extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c new file mode 100644 index 000000000000..dafaefb84038 --- /dev/null +++ b/kernel/trace/trace_nop.c @@ -0,0 +1,65 @@ +/* + * nop tracer + * + * Copyright (C) 2008 Steven Noonan + * + */ + +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; + +static void start_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void stop_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void nop_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_nop_trace(tr); +} + +static void nop_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_nop_trace(tr); +} + +static void nop_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_nop_trace(tr); + else + stop_nop_trace(tr); +} + +static struct tracer nop_trace __read_mostly = +{ + .name = "nop", + .init = nop_trace_init, + .reset = nop_trace_reset, + .ctrl_update = nop_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_nop, +#endif +}; + +__init static int init_nop_trace(void) +{ + return register_tracer(&nop_trace); +} +device_initcall(init_nop_trace); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 630715bbd572..82db9103b9bc 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -418,6 +418,15 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } #endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ +#ifdef CONFIG_NOP_TRACER +int +trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) +{ + /* What could possibly go wrong? */ + return 0; +} +#endif + #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { -- cgit v1.2.1 From 71c67d58b5660f8e42c7d4c3e77cbc03fac5ed31 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sat, 20 Sep 2008 01:00:37 -0700 Subject: ftrace: mcount_addr defined but not used When CONFIG_DYNAMIC_FTRACE isn't used, neither is mcount_addr. This patch eliminates that warning. Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c9e09d86e1d8..7694f3e59797 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -36,14 +36,6 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* - * Since MCOUNT_ADDR may point to mcount itself, we do not want - * to get it confused by reading a reference in the code as we - * are parsing on objcopy output of text. Use a variable for - * it instead. - */ -static unsigned long mcount_addr = MCOUNT_ADDR; - /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -178,6 +170,14 @@ static DEFINE_SPINLOCK(ftrace_hash_lock); #define ftrace_hash_unlock(flags) do { } while(0) #endif +/* + * Since MCOUNT_ADDR may point to mcount itself, we do not want + * to get it confused by reading a reference in the code as we + * are parsing on objcopy output of text. Use a variable for + * it instead. + */ +static unsigned long mcount_addr = MCOUNT_ADDR; + static struct task_struct *ftraced_task; enum { -- cgit v1.2.1 From 8925b394eca0bb0a444a6487d702d0f650e94a10 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sat, 20 Sep 2008 01:00:38 -0700 Subject: trace: remove pointless ifdefs The functions are already 'extern' anyway, so there's no problem with linkage. Removing these ifdefs also helps find any potential compiler errors. Suggested by Andrew Morton. Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 447d4b9b6391..c8c687088b4d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -319,38 +319,22 @@ extern int DYN_FTRACE_TEST_NAME(void); #endif #ifdef CONFIG_FTRACE_STARTUP_TEST -#ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_IRQSOFF_TRACER extern int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_PREEMPT_TRACER extern int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr); -#endif -#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_SCHED_TRACER extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_NOP_TRACER extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_SYSPROF_TRACER extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); -#endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); -- cgit v1.2.1 From 35cb5ed01261f5669657d2f1720aca902299887d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Sun, 21 Sep 2008 20:10:14 +0200 Subject: tracing/ftrace: make nop tracer reset previous entries If nop tracer is selected, some old entries from the previous tracer could still be enqueued. Tracing have to be reset. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/trace/trace_nop.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index dafaefb84038..9fb02c17ad0c 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -26,8 +26,12 @@ static void stop_nop_trace(struct trace_array *tr) static void nop_trace_init(struct trace_array *tr) { + int cpu; ctx_trace = tr; + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); + if (tr->ctrl) start_nop_trace(tr); } -- cgit v1.2.1 From 2a3a4f669df2164288d11406d11d5e4933bf5e53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Sun, 21 Sep 2008 20:12:14 +0200 Subject: tracing/ftrace: tracing engine depends on Nop Tracer Now that the nop tracer is used as the default tracer by replacing the "none" tracer, tracing engine depends on it. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d7b2de744631..254328dec672 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,8 +1,13 @@ # # Architectures that offer an FTRACE implementation should select HAVE_FTRACE: # + +config NOP_TRACER + bool + config HAVE_FTRACE bool + select NOP_TRACER config HAVE_DYNAMIC_FTRACE bool @@ -101,16 +106,6 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config NOP_TRACER - bool "NOP Tracer" - depends on HAVE_FTRACE - depends on DEBUG_KERNEL - select TRACING - help - This tracer does nothing. The primary purpose for it is to - politely print the output of ftrace_printk() calls without - the overhead of an irrelevant trace taking place. - config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on HAVE_FTRACE -- cgit v1.2.1 From 43a15386c4faf913f7d70a47748c266d6210cd6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Sun, 21 Sep 2008 20:16:30 +0200 Subject: tracing/ftrace: replace none tracer by nop tracer Replace "none" tracer by the recently created "nop" tracer. Both are pretty similar except that nop accepts TRACE_PRINT or TRACE_SPECIAL entries. And as a consequence, changing the size of the ring buffer now requires that tracing has already been disabled. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 49 +++++++++--------------------------------------- kernel/trace/trace.h | 2 ++ kernel/trace/trace_nop.c | 7 +------ 3 files changed, 12 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eee1fd964898..f2ef72fa3f17 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -142,24 +142,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds iter_ctrl options */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -static notrace void no_trace_init(struct trace_array *tr) -{ - int cpu; - - ftrace_function_enabled = 0; - if(tr->ctrl) - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); - tracer_enabled = 0; -} - -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = { - .name = "none", - .init = no_trace_init -}; - - /** * trace_wake_up - wake up tasks waiting for trace input * @@ -962,7 +944,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) long disabled; int cpu; - if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + if (tracing_disabled || !tr->ctrl) return; local_irq_save(flags); @@ -2795,6 +2777,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, unsigned long val; char buf[64]; int i, ret; + struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2814,9 +2797,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); - if (current_trace != &no_tracer) { + if (tr->ctrl) { cnt = -EBUSY; - pr_info("ftrace: set current_tracer to none" + pr_info("ftrace: please disable tracing" " before modifying buffer size\n"); goto out; } @@ -2879,20 +2862,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } -static int tracing_open_mark(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_open_generic(inode, filp); - if (ret) - return ret; - - if (current_trace == &no_tracer) - return -ENODEV; - - return 0; -} - static int mark_printk(const char *fmt, ...) { int ret; @@ -2911,7 +2880,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, char *end; struct trace_array *tr = &global_trace; - if (current_trace == &no_tracer || !tr->ctrl || tracing_disabled) + if (!tr->ctrl || tracing_disabled) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -2971,7 +2940,7 @@ static struct file_operations tracing_entries_fops = { }; static struct file_operations tracing_mark_fops = { - .open = tracing_open_mark, + .open = tracing_open_generic, .write = tracing_mark_write, }; @@ -3123,7 +3092,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) long disabled; int cpu, len = 0, write, written = 0; - if (current_trace == &no_tracer || !tr->ctrl || tracing_disabled) + if (!tr->ctrl || tracing_disabled) return 0; local_irq_save(flags); @@ -3539,8 +3508,8 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); - register_tracer(&no_tracer); - current_trace = &no_tracer; + register_tracer(&nop_trace); + current_trace = &nop_trace; /* All seems OK, enable tracing */ global_trace.ctrl = tracer_enabled; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c8c687088b4d..cb2c3fb7dd54 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -369,4 +369,6 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK = 0x400, }; +extern struct tracer nop_trace; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 9fb02c17ad0c..16c9ba060bab 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -51,7 +51,7 @@ static void nop_trace_ctrl_update(struct trace_array *tr) stop_nop_trace(tr); } -static struct tracer nop_trace __read_mostly = +struct tracer nop_trace __read_mostly = { .name = "nop", .init = nop_trace_init, @@ -62,8 +62,3 @@ static struct tracer nop_trace __read_mostly = #endif }; -__init static int init_nop_trace(void) -{ - return register_tracer(&nop_trace); -} -device_initcall(init_nop_trace); -- cgit v1.2.1 From 05736a427f7e16be948ccbf39782bd3a6ae16b14 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 22 Sep 2008 14:55:47 -0700 Subject: ftrace: warn on failure to disable mcount callers With the recent updates to ftrace, there should not be any failures when modifying the code. If there is, then we need to warn about it. This patch has a cleaned up version of the code that I used to discover that the weak symbols were causing failures. Signed-off-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7694f3e59797..4dda4f60a2a9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -576,6 +576,16 @@ static void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + static int ftrace_code_disable(struct dyn_ftrace *rec) { @@ -590,6 +600,23 @@ ftrace_code_disable(struct dyn_ftrace *rec) failed = ftrace_modify_code(ip, call, nop); if (failed) { + switch (failed) { + case 1: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case 2: + WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" expected: ", call); + print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" replace: ", nop); + printk(KERN_CONT "\n"); + break; + } + rec->flags |= FTRACE_FL_FAILED; return 0; } -- cgit v1.2.1 From d74185ed27651ad8d920b37d7851306ad01f7d6f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 29 Sep 2008 16:00:05 +0800 Subject: markers: fix unregister bug and reenter bug unregister bug: codes using makers are typically calling marker_probe_unregister() and then destroying the data that marker_probe_func needs(or unloading this module). This is bug when the corresponding marker_probe_func is still running(on other cpus), it is using the destroying/ed data. we should call synchronize_sched() after marker_update_probes(). reenter bug: marker_probe_register(), marker_probe_unregister() and marker_probe_unregister_private_data() are not reentrant safe functions. these 3 functions release markers_mutex and then require it again and do "entry->oldptr = old; ...", but entry->oldptr maybe is using now for these 3 functions may reenter when markers_mutex is released. we use synchronize_sched() instead of call_rcu_sched() to fix this bug. actually we can do: " if (entry->rcu_pending) rcu_barrier_sched(); " after require markers_mutex again. but synchronize_sched() is better and simpler. For these 3 functions are not critical path. Signed-off-by: Lai Jiangshan Cc: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 52 ++++++---------------------------------------------- 1 file changed, 6 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 7d1faecd7a51..9f76c4a622ec 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -60,9 +60,6 @@ struct marker_entry { struct marker_probe_closure single; struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - unsigned char rcu_pending:1; unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -199,16 +196,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) } EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); -static void free_old_closure(struct rcu_head *head) -{ - struct marker_entry *entry = container_of(head, - struct marker_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; -} - static void debug_print_probes(struct marker_entry *entry) { int i; @@ -417,7 +404,6 @@ static struct marker_entry *add_marker(const char *name, const char *format) e->multi = NULL; e->ptype = 0; e->refcount = 0; - e->rcu_pending = 0; hlist_add_head(&e->hlist, head); return e; } @@ -447,9 +433,6 @@ static int remove_marker(const char *name) if (e->single.func != __mark_empty_function) return -EBUSY; hlist_del(&e->hlist); - /* Make sure the call_rcu has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); kfree(e); return 0; } @@ -479,12 +462,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format) e->multi = (*entry)->multi; e->ptype = (*entry)->ptype; e->refcount = (*entry)->refcount; - e->rcu_pending = 0; hlist_add_before(&e->hlist, &(*entry)->hlist); hlist_del(&(*entry)->hlist); - /* Make sure the call_rcu has been executed */ - if ((*entry)->rcu_pending) - rcu_barrier_sched(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -658,12 +637,6 @@ int marker_probe_register(const char *name, const char *format, goto end; } } - /* - * If we detect that a call_rcu is pending for this marker, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); old = marker_entry_add_probe(entry, probe, probe_private); if (IS_ERR(old)) { ret = PTR_ERR(old); @@ -671,14 +644,11 @@ int marker_probe_register(const char *name, const char *format, } mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ + synchronize_sched(); + kfree(old); mutex_lock(&markers_mutex); entry = get_marker(name); WARN_ON(!entry); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -708,20 +678,15 @@ int marker_probe_unregister(const char *name, entry = get_marker(name); if (!entry) goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ + synchronize_sched(); + kfree(old); mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) goto end; - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ ret = 0; end: @@ -787,19 +752,14 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, ret = -ENOENT; goto end; } - if (entry->rcu_pending) - rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ + synchronize_sched(); + kfree(old); mutex_lock(&markers_mutex); entry = get_marker_from_private_data(probe, probe_private); WARN_ON(!entry); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); -- cgit v1.2.1 From ca2db6cf30d6a45798b7a760d0c4f7735b16799d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 30 Sep 2008 01:49:39 -0400 Subject: tracepoints: use rcu sched Make tracepoints use rcu sched. (cleanup) Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c7c62a4a75f5..db39961a0d03 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -80,10 +80,7 @@ static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); -#ifdef CONFIG_PREEMPT_RCU - synchronize_sched(); /* Until we have the call_rcu_sched() */ -#endif - call_rcu(&entry->rcu, free_old_closure); + call_rcu_sched(&entry->rcu, free_old_closure); } static void debug_print_probes(struct tracepoint_entry *entry) @@ -245,9 +242,9 @@ static int remove_tracepoint(const char *name) if (e->refcount) return -EBUSY; hlist_del(&e->hlist); - /* Make sure the call_rcu has been executed */ + /* Make sure the call_rcu_sched has been executed */ if (e->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); kfree(e); return 0; } @@ -344,11 +341,11 @@ int tracepoint_probe_register(const char *name, void *probe) } } /* - * If we detect that a call_rcu is pending for this tracepoint, + * If we detect that a call_rcu_sched is pending for this tracepoint, * make sure it's executed now. */ if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = tracepoint_entry_add_probe(entry, probe); if (IS_ERR(old)) { ret = PTR_ERR(old); @@ -387,7 +384,7 @@ int tracepoint_probe_unregister(const char *name, void *probe) if (!entry) goto end; if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = tracepoint_entry_remove_probe(entry, probe); mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ -- cgit v1.2.1 From 9a1e9693f534174945154197fec4ec92f168ce21 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 30 Sep 2008 01:51:12 -0400 Subject: tracepoints: fix reentrancy The tracepoints had the same problem markers did have wrt reentrancy. Apply a similar fix using a rcu_barrier after each tracepoint mutex lock. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index db39961a0d03..f2b7c28a4708 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -356,6 +356,8 @@ int tracepoint_probe_register(const char *name, void *probe) mutex_lock(&tracepoints_mutex); entry = get_tracepoint(name); WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); tracepoint_entry_free_old(entry, old); end: mutex_unlock(&tracepoints_mutex); @@ -392,6 +394,8 @@ int tracepoint_probe_unregister(const char *name, void *probe) entry = get_tracepoint(name); if (!entry) goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); tracepoint_entry_free_old(entry, old); remove_tracepoint(name); /* Ignore busy error message */ ret = 0; -- cgit v1.2.1 From e2d3b75dbc486253c910722486ac64087f96c59f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Sep 2008 11:08:03 -0400 Subject: markers: fix unregister bug and reenter bug, cleanup Use the new rcu_read_lock_sched/unlock_sched() in marker code around the call site instead of preempt_disable/enable(). It helps reviewing the code more easily. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 9f76c4a622ec..fe5ca72f9659 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -100,11 +100,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) char ptype; /* - * preempt_disable does two things : disabling preemption to make sure - * the teardown of the callbacks can be done correctly when they are in - * modules and they insure RCU read coherency. + * rcu_read_lock_sched does two things : disabling preemption to make + * sure the teardown of the callbacks can be done correctly when they + * are in modules and they insure RCU read coherency. */ - preempt_disable(); + rcu_read_lock_sched(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -142,7 +142,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) va_end(args); } } - preempt_enable(); + rcu_read_unlock_sched(); } EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -159,7 +159,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) va_list args; /* not initialized */ char ptype; - preempt_disable(); + rcu_read_lock_sched(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -192,7 +192,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); } - preempt_enable(); + rcu_read_unlock_sched(); } EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); @@ -539,7 +539,7 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * Disable a marker and its probe callback. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured - * by preempt_disable around the call site. + * by rcu_read_lock_sched around the call site. */ static void disable_marker(struct marker *elem) { -- cgit v1.2.1 From ed86a59071a4ccd0e9bcefc61e013759a21eda78 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 1 Oct 2008 12:03:25 -0400 Subject: markers: re-enable fast batch registration Lai Jiangshan discovered a reentrancy issue with markers and fixed it by adding synchronize_sched() calls at each registration/unregistraiton. It works, but it removes the ability to do batch registration/unregistration and can cause registration of ~100 markers to take about 30 seconds on a loaded machine (synchronize_sched() is much slower on such workloads). This patch implements a version of the fix which won't slow down marker batch registration/unregistration. It also go back to the original non-synchronized reg/unreg. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index fe5ca72f9659..05a25776f71f 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -60,6 +60,9 @@ struct marker_entry { struct marker_probe_closure single; struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -196,6 +199,16 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) } EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); +static void free_old_closure(struct rcu_head *head) +{ + struct marker_entry *entry = container_of(head, + struct marker_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + static void debug_print_probes(struct marker_entry *entry) { int i; @@ -404,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format) e->multi = NULL; e->ptype = 0; e->refcount = 0; + e->rcu_pending = 0; hlist_add_head(&e->hlist, head); return e; } @@ -433,6 +447,9 @@ static int remove_marker(const char *name) if (e->single.func != __mark_empty_function) return -EBUSY; hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier_sched(); kfree(e); return 0; } @@ -462,8 +479,12 @@ static int marker_set_format(struct marker_entry **entry, const char *format) e->multi = (*entry)->multi; e->ptype = (*entry)->ptype; e->refcount = (*entry)->refcount; + e->rcu_pending = 0; hlist_add_before(&e->hlist, &(*entry)->hlist); hlist_del(&(*entry)->hlist); + /* Make sure the call_rcu has been executed */ + if ((*entry)->rcu_pending) + rcu_barrier_sched(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -637,6 +658,12 @@ int marker_probe_register(const char *name, const char *format, goto end; } } + /* + * If we detect that a call_rcu is pending for this marker, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier_sched(); old = marker_entry_add_probe(entry, probe, probe_private); if (IS_ERR(old)) { ret = PTR_ERR(old); @@ -644,11 +671,16 @@ int marker_probe_register(const char *name, const char *format, } mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ - synchronize_sched(); - kfree(old); mutex_lock(&markers_mutex); entry = get_marker(name); WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu_sched(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -678,15 +710,22 @@ int marker_probe_unregister(const char *name, entry = get_marker(name); if (!entry) goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ - synchronize_sched(); - kfree(old); mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ ret = 0; end: @@ -752,14 +791,21 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, ret = -ENOENT; goto end; } + if (entry->rcu_pending) + rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ - synchronize_sched(); - kfree(old); mutex_lock(&markers_mutex); entry = get_marker_from_private_data(probe, probe_private); WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); -- cgit v1.2.1 From 48043bcdf8f398d28e75ffed6ee85208d751f87f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 8 Oct 2008 10:23:36 +0800 Subject: markers: fix unchecked format when the second, third... probe is registered, its format is not checked, this patch fixes it. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 05a25776f71f..3b75d0e8b5a4 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -653,11 +653,17 @@ int marker_probe_register(const char *name, const char *format, entry = get_marker(name); if (!entry) { entry = add_marker(name, format); - if (IS_ERR(entry)) { + if (IS_ERR(entry)) ret = PTR_ERR(entry); - goto end; - } + } else if (format) { + if (!entry->format) + ret = marker_set_format(&entry, format); + else if (strcmp(entry->format, format)) + ret = -EPERM; } + if (ret) + goto end; + /* * If we detect that a call_rcu is pending for this marker, * make sure it's executed now. -- cgit v1.2.1 From 1b7ae37c030a9fbbb5ebbf5d7bbfd7208cf805b7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 10 Oct 2008 14:43:57 +0800 Subject: markers: bit-field is not thread-safe nor smp-safe bit-field is not thread-safe nor smp-safe. struct marker_entry.rcu_pending is not protected by any lock in rcu-callback free_old_closure(). so we must turn it into a safe type. detail: I suppose rcu_pending and ptype are store in struct marker_entry.tmp1 free_old_closure() side: change ptype side: | load struct marker_entry.tmp1 --------------------------------|-------------------------------- | change ptype bit in tmp1 load struct marker_entry.tmp1 | change rcu_pending bit in tmp1 | store tmp1 | --------------------------------|-------------------------------- | store tmp1 now this result equals that free_old_closure() do not change rcu_pending bit, bug! This bug will cause redundant rcu_barrier_sched() called. not too harmful. ----- corresponding: free_old_closure() side: change ptype side: load struct marker_entry.tmp1 | --------------------------------|-------------------------------- | load struct marker_entry.tmp1 change rcu_pending bit in tmp1 | | change ptype bit in tmp1 | store tmp1 --------------------------------|-------------------------------- store tmp1 | now this result equals that change ptype side do not change ptype bit, bug! this bug cause marker_probe_cb() access to invalid memory. oops! see also: http://en.wikipedia.org/wiki/Bit_field Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 3b75d0e8b5a4..e9c6b2bc9400 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -62,7 +62,7 @@ struct marker_entry { int refcount; /* Number of times armed. 0 if disarmed. */ struct rcu_head rcu; void *oldptr; - unsigned char rcu_pending:1; + int rcu_pending; unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; -- cgit v1.2.1 From d13744cd6e3fef373a3fe656ac349b4e7c49ff79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Tue, 23 Sep 2008 11:32:08 +0100 Subject: tracing/ftrace: add the boot tracer Add the boot/initcall tracer. It's primary purpose is to be able to trace the initcalls. It is intended to be used with scripts/bootgraph.pl after some small improvements. Note that it is not active after its init. To avoid tracing (and so crashing) before the whole tracing engine init, you have to explicitly call start_boot_trace() after do_pre_smp_initcalls() to enable it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 ++ kernel/trace/trace_boot.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 kernel/trace/trace_boot.c (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cb2c3fb7dd54..b28bf8812efc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -6,6 +6,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -19,6 +20,7 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, + TRACE_BOOT, __TRACE_LAST_TYPE }; @@ -30,6 +32,7 @@ struct ftrace_entry { unsigned long ip; unsigned long parent_ip; }; +extern struct tracer boot_tracer; /* * Context switch trace entry - which task (and prio) we switched from/to: @@ -108,6 +111,7 @@ struct trace_field { struct print_entry print; struct mmiotrace_rw mmiorw; struct mmiotrace_map mmiomap; + struct boot_trace initcall; }; }; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c new file mode 100644 index 000000000000..c65ef8ffd6b4 --- /dev/null +++ b/kernel/trace/trace_boot.c @@ -0,0 +1,101 @@ +/* + * ring buffer based initcalls tracer + * + * Copyright (C) 2008 Frederic Weisbecker + * + */ + +#include +#include +#include + +#include "trace.h" + +static struct trace_array *boot_trace; +static int trace_boot_enabled; + + +/* Should be started after do_pre_smp_initcalls() in init/main.c */ +void start_boot_trace(void) +{ + trace_boot_enabled = 1; +} + +void stop_boot_trace(struct trace_array *tr) +{ + trace_boot_enabled = 0; +} + +static void boot_trace_init(struct trace_array *tr) +{ + int cpu; + boot_trace = tr; + + trace_boot_enabled = 0; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr->data[cpu]); +} + +static void boot_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_boot_trace(); + else + stop_boot_trace(tr); +} + +static int initcall_print_line(struct trace_iterator *iter) +{ + int ret = 1; + struct trace_entry *entry = iter->ent; + struct boot_trace *it = &entry->field.initcall; + struct trace_seq *s = &iter->seq; + + if (iter->ent->type == TRACE_BOOT) + ret = trace_seq_printf(s, "%pF called from %i " + "returned %d after %lld msecs\n", + it->func, it->caller, it->result, + it->duration); + if (ret) + return 1; + return 0; +} + +struct tracer boot_tracer __read_mostly = +{ + .name = "initcall", + .init = boot_trace_init, + .reset = stop_boot_trace, + .ctrl_update = boot_trace_ctrl_update, + .print_line = initcall_print_line, +}; + + +void trace_boot(struct boot_trace *it) +{ + struct trace_entry *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = boot_trace; + + if (!trace_boot_enabled) + return; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_BOOT; + entry->field.initcall = *it; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + trace_wake_up(); + + preempt_enable(); +} -- cgit v1.2.1 From b5ad384e79add1d87fff54070000dadcf218ffab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Tue, 23 Sep 2008 11:34:32 +0100 Subject: tracing/ftrace: make tracing suitable to run the boot tracer The tracing engine have now to be init in early_initcall to set the boot tracer. Only the debugfs settings will be initialized at fs_initcall time. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2ef72fa3f17..6ada059832a6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2990,7 +2990,7 @@ struct dentry *tracing_init_dentry(void) #include "trace_selftest.c" #endif -static __init void tracer_init_debugfs(void) +static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; struct dentry *entry; @@ -3078,6 +3078,7 @@ static __init void tracer_init_debugfs(void) #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + return 0; } int trace_vprintk(unsigned long ip, const char *fmt, va_list args) @@ -3504,17 +3505,20 @@ __init static int tracer_alloc_buffers(void) pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); pr_info(" actual entries %ld\n", global_trace.entries); - tracer_init_debugfs(); - trace_init_cmdlines(); register_tracer(&nop_trace); +#ifdef CONFIG_BOOT_TRACER + register_tracer(&boot_tracer); + current_trace = &boot_tracer; + current_trace->init(&global_trace); +#else current_trace = &nop_trace; +#endif /* All seems OK, enable tracing */ global_trace.ctrl = tracer_enabled; tracing_disabled = 0; - atomic_notifier_chain_register(&panic_notifier_list, &trace_panic_notifier); @@ -3548,4 +3552,5 @@ __init static int tracer_alloc_buffers(void) } return ret; } -fs_initcall(tracer_alloc_buffers); +early_initcall(tracer_alloc_buffers); +fs_initcall(tracer_init_debugfs); -- cgit v1.2.1 From 1f5c2abbdeb2bb07b20c6a66bfecefe6c867b1ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Tue, 23 Sep 2008 11:36:20 +0100 Subject: tracing/ftrace: give an entry on the config for boot tracer Bring the entry to choose the boot tracer on the kernel config. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 12 ++++++++++++ kernel/trace/Makefile | 1 + 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 254328dec672..81a17ef6b942 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -116,6 +116,18 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. +config BOOT_TRACER + bool "Trace boot initcalls" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select TRACING + help + This tracer helps developers to optimize boot times: it records + the timings of the initcalls. Its aim is to be parsed by the + /scripts/bootgraph.pl tool to produce pretty graphics about + boot inefficiencies, giving a visual representation of the + delays during initcalls. + config STACK_TRACER bool "Trace max stack" depends on HAVE_FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 73ba13f5a461..35a07f7cfa86 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,5 +22,6 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_BOOT_TRACER) += trace_boot.o libftrace-y := ftrace.o -- cgit v1.2.1 From 3ce2b9200da8b7170cc7463b7ee4212fad7b291e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Wed, 24 Sep 2008 10:36:09 +0100 Subject: ftrace/fastboot: disable tracers self-tests when boot tracer is selected The tracing engine resets the ring buffer and the tracers touch it too during self-tests. These self-tests happen during tracers registering and work against boot tracing which is logging initcalls. We have to disable tracing self-tests if the boot-tracer is selected. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 81a17ef6b942..4feb3c81f94d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -126,7 +126,9 @@ config BOOT_TRACER the timings of the initcalls. Its aim is to be parsed by the /scripts/bootgraph.pl tool to produce pretty graphics about boot inefficiencies, giving a visual representation of the - delays during initcalls. + delays during initcalls. Note that tracers self tests can't + be enabled if this tracer is selected since only one tracer + should touch the tracing buffer at a time. config STACK_TRACER bool "Trace max stack" @@ -168,8 +170,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING - depends on DEBUG_KERNEL + depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup -- cgit v1.2.1 From 7c572ac0cf5e8cd8e17f084bc6c253296cc42279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Thu, 25 Sep 2008 13:25:30 +0100 Subject: tracing/ftrace: don't consume unhandled entries by boot tracer When the boot tracer can't handle an entry output, it returns 1. It should return 0 to relay on other output functions. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index c65ef8ffd6b4..d5c9e2e4a9c4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -47,7 +47,7 @@ static void boot_trace_ctrl_update(struct trace_array *tr) static int initcall_print_line(struct trace_iterator *iter) { - int ret = 1; + int ret = 0; struct trace_entry *entry = iter->ent; struct boot_trace *it = &entry->field.initcall; struct trace_seq *s = &iter->seq; -- cgit v1.2.1 From 5aa60c6073456812251caf9177cb921b2de68f77 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Sep 2008 23:02:37 -0400 Subject: ftrace: give time for wakeup test to run It is possible that the testing thread in the ftrace wakeup test does not run before we stop the trace. This will cause the trace to fail since nothing will be in the buffers. This patch adds a small wait in the wakeup test to allow for the woken task to run and be traced. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 82db9103b9bc..5ebd4b135498 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -498,6 +498,9 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wake_up_process(p); + /* give a little time to let the thread wake up */ + msleep(100); + /* stop the tracing. */ tr->ctrl = 0; trace->ctrl_update(tr); -- cgit v1.2.1 From 7a8e76a3829f1067b70f715771ff88baf2fbf3c3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Sep 2008 23:02:38 -0400 Subject: tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: - array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: - array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 + kernel/trace/Makefile | 1 + kernel/trace/ring_buffer.c | 1672 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1677 insertions(+) create mode 100644 kernel/trace/ring_buffer.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4feb3c81f94d..396aea11398e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -18,9 +18,13 @@ config HAVE_FTRACE_MCOUNT_RECORD config TRACER_MAX_TRACE bool +config RING_BUFFER + bool + config TRACING bool select DEBUG_FS + select RING_BUFFER select STACKTRACE select TRACEPOINTS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 35a07f7cfa86..a85dfba88ba0 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o endif obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 000000000000..830a2930dd91 --- /dev/null +++ b/kernel/trace/ring_buffer.c @@ -0,0 +1,1672 @@ +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include /* used for sched_clock() (for now) */ +#include +#include +#include +#include + +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +/* FIXME!!! */ +u64 ring_buffer_time_stamp(int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return sched_clock() << DEBUG_SHIFT; +} + +void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} + +#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_ALIGNMENT_SHIFT 2 +#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +#define RB_MAX_SMALL_DATA 28 + +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; + +/* inline for ring buffer fast paths */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ + unsigned length; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + /* undefined */ + return -1; + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; + + case RINGBUF_TYPE_TIME_STAMP: + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: + if (event->len) + length = event->len << RB_ALIGNMENT_SHIFT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; + default: + BUG(); + } + /* not hit */ + return 0; +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ + return rb_event_length(event); +} + +/* inline for ring buffer fast paths */ +static inline void * +rb_event_data(struct ring_buffer_event *event) +{ + BUG_ON(event->type != RINGBUF_TYPE_DATA); + /* If length is in len field, then array[0] has the data */ + if (event->len) + return (void *)&event->array[0]; + /* Otherwise length is in array[0] and array[1] has the data */ + return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ + return rb_event_data(event); +} + +#define for_each_buffer_cpu(buffer, cpu) \ + for_each_cpu_mask(cpu, buffer->cpumask) + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * This hack stolen from mm/slob.c. + * We can store per page timing information in the page frame of the page. + * Thanks to Peter Zijlstra for suggesting this idea. + */ +struct buffer_page { + union { + struct { + unsigned long flags; /* mandatory */ + atomic_t _count; /* mandatory */ + u64 time_stamp; /* page time stamp */ + unsigned size; /* size of page data */ + struct list_head list; /* list of free pages */ + }; + struct page page; + }; +}; + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline int test_time_stamp(u64 delta) +{ + if (delta & TS_DELTA_TEST) + return 1; + return 0; +} + +#define BUF_PAGE_SIZE PAGE_SIZE + +/* + * head_page == tail_page && head == tail then buffer is empty. + */ +struct ring_buffer_per_cpu { + int cpu; + struct ring_buffer *buffer; + spinlock_t lock; + struct lock_class_key lock_key; + struct list_head pages; + unsigned long head; /* read from head */ + unsigned long tail; /* write to tail */ + struct buffer_page *head_page; + struct buffer_page *tail_page; + unsigned long overrun; + unsigned long entries; + u64 write_stamp; + u64 read_stamp; + atomic_t record_disabled; +}; + +struct ring_buffer { + unsigned long size; + unsigned pages; + unsigned flags; + int cpus; + cpumask_t cpumask; + atomic_t record_disabled; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; +}; + +struct ring_buffer_iter { + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long head; + struct buffer_page *head_page; + u64 read_stamp; +}; + +#define RB_WARN_ON(buffer, cond) \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return -1; \ + } + +/** + * check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. + */ +static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + RB_WARN_ON(cpu_buffer, head->next->prev != head); + RB_WARN_ON(cpu_buffer, head->prev->next != head); + + list_for_each_entry_safe(page, tmp, head, list) { + RB_WARN_ON(cpu_buffer, page->list.next->prev != &page->list); + RB_WARN_ON(cpu_buffer, page->list.prev->next != &page->list); + } + + return 0; +} + +static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->head_page->size; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + unsigned long addr; + LIST_HEAD(pages); + unsigned i; + + for (i = 0; i < nr_pages; i++) { + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + page = (struct buffer_page *)virt_to_page(addr); + list_add(&page->list, &pages); + } + + list_splice(&pages, head); + + rb_check_pages(cpu_buffer); + + return 0; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, list) { + list_del_init(&page->list); + __free_page(&page->page); + } + return -ENOMEM; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int ret; + + cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpu_buffer) + return NULL; + + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->lock); + INIT_LIST_HEAD(&cpu_buffer->pages); + + ret = rb_allocate_pages(cpu_buffer, buffer->pages); + if (ret < 0) + goto fail_free_buffer; + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + cpu_buffer->tail_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + + return cpu_buffer; + + fail_free_buffer: + kfree(cpu_buffer); + return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + list_for_each_entry_safe(page, tmp, head, list) { + list_del_init(&page->list); + __free_page(&page->page); + } + kfree(cpu_buffer); +} + +/** + * ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes that is needed. + * @flags: attributes to set for the ring buffer. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +{ + struct ring_buffer *buffer; + int bsize; + int cpu; + + /* keep it in its own cache line */ + buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), + GFP_KERNEL); + if (!buffer) + return NULL; + + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; + + /* need at least two pages */ + if (buffer->pages == 1) + buffer->pages++; + + buffer->cpumask = cpu_possible_map; + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; + buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), + GFP_KERNEL); + if (!buffer->buffers) + goto fail_free_buffer; + + for_each_buffer_cpu(buffer, cpu) { + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; + } + + mutex_init(&buffer->mutex); + + return buffer; + + fail_free_buffers: + for_each_buffer_cpu(buffer, cpu) { + if (buffer->buffers[cpu]) + rb_free_cpu_buffer(buffer->buffers[cpu]); + } + kfree(buffer->buffers); + + fail_free_buffer: + kfree(buffer); + return NULL; +} + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. + */ +void +ring_buffer_free(struct ring_buffer *buffer) +{ + int cpu; + + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + + kfree(buffer); +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static void +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(&cpu_buffer->pages)); + p = cpu_buffer->pages.next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + __free_page(&page->page); + } + BUG_ON(list_empty(&cpu_buffer->pages)); + + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + +} + +static void +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *pages, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(pages)); + p = pages->next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + list_add_tail(&page->list, &cpu_buffer->pages); + } + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to resize. + * @size: the new size. + * + * The tracer is responsible for making sure that the buffer is + * not being used while changing the size. + * Note: We may be able to change the above requirement by using + * RCU synchronizations. + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns -1 on failure. + */ +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned nr_pages, rm_pages, new_pages; + struct buffer_page *page, *tmp; + unsigned long buffer_size; + unsigned long addr; + LIST_HEAD(pages); + int i, cpu; + + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size *= BUF_PAGE_SIZE; + buffer_size = buffer->pages * BUF_PAGE_SIZE; + + /* we need a minimum of two pages */ + if (size < BUF_PAGE_SIZE * 2) + size = BUF_PAGE_SIZE * 2; + + if (size == buffer_size) + return size; + + mutex_lock(&buffer->mutex); + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + + if (size < buffer_size) { + + /* easy case, just free pages */ + BUG_ON(nr_pages >= buffer->pages); + + rm_pages = buffer->pages - nr_pages; + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_remove_pages(cpu_buffer, rm_pages); + } + goto out; + } + + /* + * This is a bit more difficult. We only want to add pages + * when we can allocate enough for all CPUs. We do this + * by allocating all the pages and storing them on a local + * link list. If we succeed in our allocation, then we + * add these pages to the cpu_buffers. Otherwise we just free + * them all and return -ENOMEM; + */ + BUG_ON(nr_pages <= buffer->pages); + new_pages = nr_pages - buffer->pages; + + for_each_buffer_cpu(buffer, cpu) { + for (i = 0; i < new_pages; i++) { + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + page = (struct buffer_page *)virt_to_page(addr); + list_add(&page->list, &pages); + } + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_insert_pages(cpu_buffer, &pages, new_pages); + } + + BUG_ON(!list_empty(&pages)); + + out: + buffer->pages = nr_pages; + mutex_unlock(&buffer->mutex); + + return size; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, list) { + list_del_init(&page->list); + __free_page(&page->page); + } + return -ENOMEM; +} + +static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->head_page == cpu_buffer->tail_page && + cpu_buffer->head == cpu_buffer->tail; +} + +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING; +} + +static inline void *rb_page_index(struct buffer_page *page, unsigned index) +{ + void *addr = page_address(&page->page); + + return addr + index; +} + +static inline struct ring_buffer_event * +rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_index(cpu_buffer->head_page, + cpu_buffer->head); +} + +static inline struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) +{ + return rb_page_index(iter->head_page, + iter->head); +} + +/* + * When the tail hits the head and the buffer is in overwrite mode, + * the head jumps to the next page and all content on the previous + * page is discarded. But before doing so, we update the overrun + * variable of the buffer. + */ +static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + unsigned long head; + + for (head = 0; head < rb_head_size(cpu_buffer); + head += rb_event_length(event)) { + + event = rb_page_index(cpu_buffer->head_page, head); + BUG_ON(rb_null_event(event)); + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->overrun++; + cpu_buffer->entries--; + } +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **page) +{ + struct list_head *p = (*page)->list.next; + + if (p == &cpu_buffer->pages) + p = p->next; + + *page = list_entry(p, struct buffer_page, list); +} + +static inline void +rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) +{ + cpu_buffer->tail_page->time_stamp = *ts; + cpu_buffer->write_stamp = *ts; +} + +static void rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->read_stamp = cpu_buffer->head_page->time_stamp; + cpu_buffer->head = 0; +} + +static void +rb_reset_iter_read_page(struct ring_buffer_iter *iter) +{ + iter->read_stamp = iter->head_page->time_stamp; + iter->head = 0; +} + +/** + * ring_buffer_update_event - update event type and data + * @event: the even to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static inline void +rb_update_event(struct ring_buffer_event *event, + unsigned type, unsigned length) +{ + event->type = type; + + switch (type) { + + case RINGBUF_TYPE_PADDING: + break; + + case RINGBUF_TYPE_TIME_EXTEND: + event->len = + (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + + case RINGBUF_TYPE_TIME_STAMP: + event->len = + (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + + case RINGBUF_TYPE_DATA: + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA) { + event->len = 0; + event->array[0] = length; + } else + event->len = + (length + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + default: + BUG(); + } +} + +static inline unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length = 1; + + if (length > RB_MAX_SMALL_DATA) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ALIGNMENT); + + return length; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *head_page, *tail_page; + unsigned long tail; + struct ring_buffer *buffer = cpu_buffer->buffer; + struct ring_buffer_event *event; + + tail_page = cpu_buffer->tail_page; + head_page = cpu_buffer->head_page; + tail = cpu_buffer->tail; + + if (tail + length > BUF_PAGE_SIZE) { + struct buffer_page *next_page = tail_page; + + rb_inc_page(cpu_buffer, &next_page); + + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) + return NULL; + + /* count overflows */ + rb_update_overflow(cpu_buffer); + + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + rb_reset_read_page(cpu_buffer); + } + + if (tail != BUF_PAGE_SIZE) { + event = rb_page_index(tail_page, tail); + /* page padding */ + event->type = RINGBUF_TYPE_PADDING; + } + + tail_page->size = tail; + tail_page = next_page; + tail_page->size = 0; + tail = 0; + cpu_buffer->tail_page = tail_page; + cpu_buffer->tail = tail; + rb_add_stamp(cpu_buffer, ts); + } + + BUG_ON(tail + length > BUF_PAGE_SIZE); + + event = rb_page_index(tail_page, tail); + rb_update_event(event, type, length); + + return event; +} + +static int +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + u64 *ts, u64 *delta) +{ + struct ring_buffer_event *event; + static int once; + + if (unlikely(*delta > (1ULL << 59) && !once++)) { + printk(KERN_WARNING "Delta way too big! %llu" + " ts=%llu write stamp = %llu\n", + *delta, *ts, cpu_buffer->write_stamp); + WARN_ON(1); + } + + /* + * The delta is too big, we to add a + * new timestamp. + */ + event = __rb_reserve_next(cpu_buffer, + RINGBUF_TYPE_TIME_EXTEND, + RB_LEN_TIME_EXTEND, + ts); + if (!event) + return -1; + + /* check to see if we went to the next page */ + if (cpu_buffer->tail) { + /* Still on same page, update timestamp */ + event->time_delta = *delta & TS_MASK; + event->array[0] = *delta >> TS_SHIFT; + /* commit the time event */ + cpu_buffer->tail += + rb_event_length(event); + cpu_buffer->write_stamp = *ts; + *delta = 0; + } + + return 0; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length) +{ + struct ring_buffer_event *event; + u64 ts, delta; + + ts = ring_buffer_time_stamp(cpu_buffer->cpu); + + if (cpu_buffer->tail) { + delta = ts - cpu_buffer->write_stamp; + + if (test_time_stamp(delta)) { + int ret; + + ret = rb_add_time_stamp(cpu_buffer, &ts, &delta); + if (ret < 0) + return NULL; + } + } else { + rb_add_stamp(cpu_buffer, &ts); + delta = 0; + } + + event = __rb_reserve_next(cpu_buffer, type, length, &ts); + if (!event) + return NULL; + + /* If the reserve went to the next page, our delta is zero */ + if (!cpu_buffer->tail) + delta = 0; + + event->time_delta = delta; + + return event; +} + +/** + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) + * @flags: a pointer to save the interrupt flags + * + * Returns a reseverd event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into + * and can use the ring_buffer_event_data() interface. + * + * The length is the length of the data needed, not the event length + * which also includes the event header. + * + * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. + * If NULL is returned, then nothing has been allocated or locked. + */ +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length, + unsigned long *flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int cpu; + + if (atomic_read(&buffer->record_disabled)) + return NULL; + + raw_local_irq_save(*flags); + cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, buffer->cpumask)) + goto out_irq; + + cpu_buffer = buffer->buffers[cpu]; + spin_lock(&cpu_buffer->lock); + + if (atomic_read(&cpu_buffer->record_disabled)) + goto no_record; + + length = rb_calculate_event_length(length); + if (length > BUF_PAGE_SIZE) + return NULL; + + event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + if (!event) + goto no_record; + + return event; + + no_record: + spin_unlock(&cpu_buffer->lock); + out_irq: + local_irq_restore(*flags); + return NULL; +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + cpu_buffer->tail += rb_event_length(event); + cpu_buffer->tail_page->size = cpu_buffer->tail; + cpu_buffer->write_stamp += event->time_delta; + cpu_buffer->entries++; +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * @flags: the interrupt flags received from ring_buffer_lock_reserve. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + assert_spin_locked(&cpu_buffer->lock); + + rb_commit(cpu_buffer, event); + + spin_unlock(&cpu_buffer->lock); + raw_local_irq_restore(flags); + + return 0; +} + +/** + * ring_buffer_write - write data to the buffer without reserving + * @buffer: The ring buffer to write to. + * @length: The length of the data being written (excluding the event header) + * @data: The data to write to the buffer. + * + * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as + * one function. If you already have the data to write to the buffer, it + * may be easier to simply call this function. + * + * Note, like ring_buffer_lock_reserve, the length is the length of the data + * and not the length of the event which would hold the header. + */ +int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, + void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned long event_length, flags; + void *body; + int ret = -EBUSY; + int cpu; + + if (atomic_read(&buffer->record_disabled)) + return -EBUSY; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, buffer->cpumask)) + goto out_irq; + + cpu_buffer = buffer->buffers[cpu]; + spin_lock(&cpu_buffer->lock); + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + event_length = rb_calculate_event_length(length); + event = rb_reserve_next_event(cpu_buffer, + RINGBUF_TYPE_DATA, event_length); + if (!event) + goto out; + + body = rb_event_data(event); + + memcpy(body, data, length); + + rb_commit(cpu_buffer, event); + + ret = 0; + out: + spin_unlock(&cpu_buffer->lock); + out_irq: + local_irq_restore(flags); + + return ret; +} + +/** + * ring_buffer_lock - lock the ring buffer + * @buffer: The ring buffer to lock + * @flags: The place to store the interrupt flags + * + * This locks all the per CPU buffers. + * + * Must be unlocked by ring_buffer_unlock. + */ +void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + local_irq_save(*flags); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + spin_lock(&cpu_buffer->lock); + } +} + +/** + * ring_buffer_unlock - unlock a locked buffer + * @buffer: The locked buffer to unlock + * @flags: The interrupt flags received by ring_buffer_lock + */ +void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) { + if (!cpu_isset(cpu, buffer->cpumask)) + continue; + cpu_buffer = buffer->buffers[cpu]; + spin_unlock(&cpu_buffer->lock); + } + + local_irq_restore(flags); +} + +/** + * ring_buffer_record_disable - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable(struct ring_buffer *buffer) +{ + atomic_inc(&buffer->record_disabled); +} + +/** + * ring_buffer_record_enable - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable(struct ring_buffer *buffer) +{ + atomic_dec(&buffer->record_disabled); +} + +/** + * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer + * @buffer: The ring buffer to stop writes to. + * @cpu: The CPU buffer to stop + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_inc(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_record_enable_cpu - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * @cpu: The CPU to enable. + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_entries_cpu - get the number of entries in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the entries from. + */ +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->overrun; +} + +/** + * ring_buffer_entries - get the number of entries in a buffer + * @buffer: The ring buffer + * + * Returns the total number of entries in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_entries(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long entries = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + entries += cpu_buffer->entries; + } + + return entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in buffer + * @buffer: The ring buffer + * + * Returns the total number of overruns in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_overruns(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long overruns = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + overruns += cpu_buffer->overrun; + } + + return overruns; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. + */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + iter->head_page = cpu_buffer->head_page; + iter->head = cpu_buffer->head; + rb_reset_iter_read_page(iter); +} + +/** + * ring_buffer_iter_empty - check if an iterator has no more to read + * @iter: The iterator to check + */ +int ring_buffer_iter_empty(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = iter->cpu_buffer; + + return iter->head_page == cpu_buffer->tail_page && + iter->head == cpu_buffer->tail; +} + +static void +rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + cpu_buffer->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static void +rb_update_iter_read_stamp(struct ring_buffer_iter *iter, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + iter->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + iter->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static void rb_advance_head(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + unsigned length; + + /* + * Check if we are at the end of the buffer. + */ + if (cpu_buffer->head >= cpu_buffer->head_page->size) { + BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page); + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + rb_reset_read_page(cpu_buffer); + return; + } + + event = rb_head_event(cpu_buffer); + + if (event->type == RINGBUF_TYPE_DATA) + cpu_buffer->entries--; + + length = rb_event_length(event); + + /* + * This should not be called to advance the header if we are + * at the tail of the buffer. + */ + BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) && + (cpu_buffer->head + length > cpu_buffer->tail)); + + rb_update_read_stamp(cpu_buffer, event); + + cpu_buffer->head += length; + + /* check for end of page */ + if ((cpu_buffer->head >= cpu_buffer->head_page->size) && + (cpu_buffer->head_page != cpu_buffer->tail_page)) + rb_advance_head(cpu_buffer); +} + +static void rb_advance_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned length; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + /* + * Check if we are at the end of the buffer. + */ + if (iter->head >= iter->head_page->size) { + BUG_ON(iter->head_page == cpu_buffer->tail_page); + rb_inc_page(cpu_buffer, &iter->head_page); + rb_reset_iter_read_page(iter); + return; + } + + event = rb_iter_head_event(iter); + + length = rb_event_length(event); + + /* + * This should not be called to advance the header if we are + * at the tail of the buffer. + */ + BUG_ON((iter->head_page == cpu_buffer->tail_page) && + (iter->head + length > cpu_buffer->tail)); + + rb_update_iter_read_stamp(iter, event); + + iter->head += length; + + /* check for end of page padding */ + if ((iter->head >= iter->head_page->size) && + (iter->head_page != cpu_buffer->tail_page)) + rb_advance_iter(iter); +} + +/** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + + again: + if (rb_per_cpu_empty(cpu_buffer)) + return NULL; + + event = rb_head_event(cpu_buffer); + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + rb_reset_read_page(cpu_buffer); + goto again; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_head(cpu_buffer); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_head(cpu_buffer); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = cpu_buffer->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. + */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + + if (ring_buffer_iter_empty(iter)) + return NULL; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + again: + if (rb_per_cpu_empty(cpu_buffer)) + return NULL; + + event = rb_iter_head_event(iter); + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + rb_inc_page(cpu_buffer, &iter->head_page); + rb_reset_iter_read_page(iter); + goto again; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = iter->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} + +/** + * ring_buffer_consume - return an event and consume it + * @buffer: The ring buffer to get the next event from + * + * Returns the next event in the ring buffer, and that event is consumed. + * Meaning, that sequential reads will keep returning a different event, + * and eventually empty the ring buffer if the producer is slower. + */ +struct ring_buffer_event * +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + event = ring_buffer_peek(buffer, cpu, ts); + if (!event) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + rb_advance_head(cpu_buffer); + + return event; +} + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @buffer: The ring buffer to read from + * @cpu: The cpu buffer to iterate over + * + * This starts up an iteration through the buffer. It also disables + * the recording to the buffer until the reading is finished. + * This prevents the reading from being corrupted. This is not + * a consuming read, so a producer is not expected. + * + * Must be paired with ring_buffer_finish. + */ +struct ring_buffer_iter * +ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_iter *iter; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + + iter->cpu_buffer = cpu_buffer; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + spin_lock(&cpu_buffer->lock); + iter->head = cpu_buffer->head; + iter->head_page = cpu_buffer->head_page; + rb_reset_iter_read_page(iter); + spin_unlock(&cpu_buffer->lock); + + return iter; +} + +/** + * ring_buffer_finish - finish reading the iterator of the buffer + * @iter: The iterator retrieved by ring_buffer_start + * + * This re-enables the recording to the buffer, and frees the + * iterator. + */ +void +ring_buffer_read_finish(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + atomic_dec(&cpu_buffer->record_disabled); + kfree(iter); +} + +/** + * ring_buffer_read - read the next item in the ring buffer by the iterator + * @iter: The ring buffer iterator + * @ts: The time stamp of the event read. + * + * This reads the next event in the ring buffer and increments the iterator. + */ +struct ring_buffer_event * +ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_event *event; + + event = ring_buffer_iter_peek(iter, ts); + if (!event) + return NULL; + + rb_advance_iter(iter); + + return event; +} + +/** + * ring_buffer_size - return the size of the ring buffer (in bytes) + * @buffer: The ring buffer. + */ +unsigned long ring_buffer_size(struct ring_buffer *buffer) +{ + return BUF_PAGE_SIZE * buffer->pages; +} + +static void +rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->head_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + cpu_buffer->tail_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + + cpu_buffer->head = cpu_buffer->tail = 0; + cpu_buffer->overrun = 0; + cpu_buffer->entries = 0; +} + +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + unsigned long flags; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + raw_local_irq_save(flags); + spin_lock(&cpu_buffer->lock); + + rb_reset_cpu(cpu_buffer); + + spin_unlock(&cpu_buffer->lock); + raw_local_irq_restore(flags); +} + +/** + * ring_buffer_reset - reset a ring buffer + * @buffer: The ring buffer to reset all cpu buffers + */ +void ring_buffer_reset(struct ring_buffer *buffer) +{ + unsigned long flags; + int cpu; + + ring_buffer_lock(buffer, &flags); + + for_each_buffer_cpu(buffer, cpu) + rb_reset_cpu(buffer->buffers[cpu]); + + ring_buffer_unlock(buffer, flags); +} + +/** + * rind_buffer_empty - is the ring buffer empty? + * @buffer: The ring buffer to test + */ +int ring_buffer_empty(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* yes this is racy, but if you don't like the race, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!rb_per_cpu_empty(cpu_buffer)) + return 0; + } + return 1; +} + +/** + * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? + * @buffer: The ring buffer + * @cpu: The CPU buffer to test + */ +int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 1; + + cpu_buffer = buffer->buffers[cpu]; + return rb_per_cpu_empty(cpu_buffer); +} + +/** + * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers + * @buffer_a: One buffer to swap with + * @buffer_b: The other buffer to swap with + * + * This function is useful for tracers that want to take a "snapshot" + * of a CPU buffer and has another back up buffer lying around. + * it is expected that the tracer handles the cpu buffer not being + * used at the moment. + */ +int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, + struct ring_buffer *buffer_b, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer_a; + struct ring_buffer_per_cpu *cpu_buffer_b; + + if (!cpu_isset(cpu, buffer_a->cpumask) || + !cpu_isset(cpu, buffer_b->cpumask)) + return -EINVAL; + + /* At least make sure the two buffers are somewhat the same */ + if (buffer_a->size != buffer_b->size || + buffer_a->pages != buffer_b->pages) + return -EINVAL; + + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + + /* + * We can't do a synchronize_sched here because this + * function can be called in atomic context. + * Normally this will be called from the same CPU as cpu. + * If not it's up to the caller to protect this. + */ + atomic_inc(&cpu_buffer_a->record_disabled); + atomic_inc(&cpu_buffer_b->record_disabled); + + buffer_a->buffers[cpu] = cpu_buffer_b; + buffer_b->buffers[cpu] = cpu_buffer_a; + + cpu_buffer_b->buffer = buffer_a; + cpu_buffer_a->buffer = buffer_b; + + atomic_dec(&cpu_buffer_a->record_disabled); + atomic_dec(&cpu_buffer_b->record_disabled); + + return 0; +} + -- cgit v1.2.1 From a7b1374333407f409cf8df7e623b12490f073c84 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Sep 2008 23:02:39 -0400 Subject: ring_buffer: add paranoid check for buffer page If for some strange reason the buffer_page gets bigger, or the page struct gets smaller, I want to know this ASAP. The best way is to not let the kernel compile. This patch adds code to test the size of the struct buffer_page against the page struct and will cause compile issues if the buffer_page ever gets bigger than the page struct. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 830a2930dd91..95ca9338cb6c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -289,6 +289,12 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* + * Causes compile errors if the struct buffer_page gets bigger + * than the struct page. + */ +extern int ring_buffer_page_too_big(void); + /** * ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes that is needed. @@ -305,6 +311,11 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) int bsize; int cpu; + /* Paranoid! Optimizes out when all is well */ + if (sizeof(struct buffer_page) > sizeof(struct page)) + ring_buffer_page_too_big(); + + /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); -- cgit v1.2.1 From ed56829cb3195de499f97fa6108fe9134319bae6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Sep 2008 23:02:40 -0400 Subject: ring_buffer: reset buffer page when freeing Mathieu Desnoyers pointed out that the freeing of the page frame needs to be reset otherwise we might trigger BUG_ON in the page free code. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 95ca9338cb6c..cfa711374d9a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -127,6 +127,17 @@ struct buffer_page { }; }; +/* + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. + */ +static inline void free_buffer_page(struct buffer_page *bpage) +{ + reset_page_mapcount(&bpage->page); + bpage->page.mapping = NULL; + __free_page(&bpage->page); +} + /* * We need to fit the time_stamp delta into 27 bits. */ @@ -240,7 +251,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, free_pages: list_for_each_entry_safe(page, tmp, &pages, list) { list_del_init(&page->list); - __free_page(&page->page); + free_buffer_page(page); } return -ENOMEM; } @@ -284,7 +295,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) list_for_each_entry_safe(page, tmp, head, list) { list_del_init(&page->list); - __free_page(&page->page); + free_buffer_page(page); } kfree(cpu_buffer); } @@ -393,7 +404,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) p = cpu_buffer->pages.next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); - __free_page(&page->page); + free_buffer_page(page); } BUG_ON(list_empty(&cpu_buffer->pages)); @@ -520,7 +531,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) free_pages: list_for_each_entry_safe(page, tmp, &pages, list) { list_del_init(&page->list); - __free_page(&page->page); + free_buffer_page(page); } return -ENOMEM; } -- cgit v1.2.1 From 3928a8a2d98081d1bc3c0a84a2d70e29b90ecf1c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Sep 2008 23:02:41 -0400 Subject: ftrace: make work with new ring buffer This patch ports ftrace over to the new ring buffer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 932 ++++++++++---------------------------- kernel/trace/trace.h | 22 +- kernel/trace/trace_boot.c | 16 +- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_irqsoff.c | 6 +- kernel/trace/trace_mmiotrace.c | 40 +- kernel/trace/trace_nop.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 60 +-- kernel/trace/trace_sysprof.c | 2 +- 11 files changed, 288 insertions(+), 798 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6ada059832a6..ef80793858b8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -33,25 +33,22 @@ #include #include +#include #include "trace.h" +#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) + unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; -static unsigned long __read_mostly tracing_nr_buffers; static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -static int trace_alloc_page(void); -static int trace_free_page(void); - static int tracing_disabled = 1; -static unsigned long tracing_pages_allocated; - long ns2usecs(cycle_t nsec) { @@ -62,7 +59,9 @@ ns2usecs(cycle_t nsec) cycle_t ftrace_now(int cpu) { - return cpu_clock(cpu); + u64 ts = ring_buffer_time_stamp(cpu); + ring_buffer_normalize_time_stamp(cpu, &ts); + return ts; } /* @@ -102,18 +101,18 @@ static int tracer_enabled = 1; int ftrace_function_enabled; /* - * trace_nr_entries is the number of entries that is allocated - * for a buffer. Note, the number of entries is always rounded - * to ENTRIES_PER_PAGE. + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded + * to page size. * * This number is purposely set to a low number of 16384. * If the dump on oops happens, it will be much appreciated * to not have to wait for all that output. Anyway this can be * boot time and run time configurable. */ -#define TRACE_ENTRIES_DEFAULT 16384UL +#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ -static unsigned long trace_nr_entries = TRACE_ENTRIES_DEFAULT; +static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; @@ -158,23 +157,21 @@ void trace_wake_up(void) wake_up(&trace_wait); } -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) - -static int __init set_nr_entries(char *str) +static int __init set_buf_size(char *str) { - unsigned long nr_entries; + unsigned long buf_size; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &nr_entries); + ret = strict_strtoul(str, 0, &buf_size); /* nr_entries can not be zero */ - if (ret < 0 || nr_entries == 0) + if (ret < 0 || buf_size == 0) return 0; - trace_nr_entries = nr_entries; + trace_buf_size = buf_size; return 1; } -__setup("trace_entries=", set_nr_entries); +__setup("trace_buf_size=", set_buf_size); unsigned long nsecs_to_usecs(unsigned long nsecs) { @@ -243,54 +240,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } -#define CHECK_COND(cond) \ - if (unlikely(cond)) { \ - tracing_disabled = 1; \ - WARN_ON(1); \ - return -1; \ - } - -/** - * check_pages - integrity check of trace buffers - * - * As a safty measure we check to make sure the data pages have not - * been corrupted. - */ -int check_pages(struct trace_array_cpu *data) -{ - struct page *page, *tmp; - - CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); - CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); - - list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - CHECK_COND(page->lru.next->prev != &page->lru); - CHECK_COND(page->lru.prev->next != &page->lru); - } - - return 0; -} - -/** - * head_page - page address of the first page in per_cpu buffer. - * - * head_page returns the page address of the first page in - * a per_cpu buffer. This also preforms various consistency - * checks to make sure the buffer has not been corrupted. - */ -void *head_page(struct trace_array_cpu *data) -{ - struct page *page; - - if (list_empty(&data->trace_pages)) - return NULL; - - page = list_entry(data->trace_pages.next, struct page, lru); - BUG_ON(&page->lru == &data->trace_pages); - - return page_address(page); -} - /** * trace_seq_printf - sequence printing of trace information * @s: trace sequence descriptor @@ -437,34 +386,6 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) trace_seq_reset(s); } -/* - * flip the trace buffers between two trace descriptors. - * This usually is the buffers between the global_trace and - * the max_tr to record a snapshot of a current trace. - * - * The ftrace_max_lock must be held. - */ -static void -flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) -{ - struct list_head flip_pages; - - INIT_LIST_HEAD(&flip_pages); - - memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, - sizeof(struct trace_array_cpu) - - offsetof(struct trace_array_cpu, trace_head_idx)); - - check_pages(tr1); - check_pages(tr2); - list_splice_init(&tr1->trace_pages, &flip_pages); - list_splice_init(&tr2->trace_pages, &tr1->trace_pages); - list_splice_init(&flip_pages, &tr2->trace_pages); - BUG_ON(!list_empty(&flip_pages)); - check_pages(tr1); - check_pages(tr2); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -477,17 +398,15 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data; - int i; + struct ring_buffer *buf = tr->buffer; WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - /* clear out all the previous traces */ - for_each_tracing_cpu(i) { - data = tr->data[i]; - flip_trace(max_tr.data[i], data); - tracing_reset(data); - } + + tr->buffer = max_tr.buffer; + max_tr.buffer = buf; + + ring_buffer_reset(tr->buffer); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -504,16 +423,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = tr->data[cpu]; - int i; + int ret; WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - for_each_tracing_cpu(i) - tracing_reset(max_tr.data[i]); - flip_trace(max_tr.data[cpu], data); - tracing_reset(data); + ring_buffer_reset(max_tr.buffer); + ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + + WARN_ON_ONCE(ret); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -550,7 +468,6 @@ int register_tracer(struct tracer *type) #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest) { struct tracer *saved_tracer = current_trace; - struct trace_array_cpu *data; struct trace_array *tr = &global_trace; int saved_ctrl = tr->ctrl; int i; @@ -562,10 +479,7 @@ int register_tracer(struct tracer *type) * If we fail, we do not register this tracer. */ for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); + tracing_reset(tr, i); } current_trace = type; tr->ctrl = 0; @@ -581,10 +495,7 @@ int register_tracer(struct tracer *type) } /* Only reset on passing, to avoid touching corrupted buffers */ for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); + tracing_reset(tr, i); } printk(KERN_CONT "PASSED\n"); } @@ -630,13 +541,9 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -void tracing_reset(struct trace_array_cpu *data) +void tracing_reset(struct trace_array *tr, int cpu) { - data->trace_idx = 0; - data->overrun = 0; - data->trace_head = data->trace_tail = head_page(data); - data->trace_head_idx = 0; - data->trace_tail_idx = 0; + ring_buffer_reset_cpu(tr->buffer, cpu); } #define SAVED_CMDLINES 128 @@ -722,70 +629,6 @@ void tracing_record_cmdline(struct task_struct *tsk) trace_save_cmdline(tsk); } -static inline struct list_head * -trace_next_list(struct trace_array_cpu *data, struct list_head *next) -{ - /* - * Roundrobin - but skip the head (which is not a real page): - */ - next = next->next; - if (unlikely(next == &data->trace_pages)) - next = next->next; - BUG_ON(next == &data->trace_pages); - - return next; -} - -static inline void * -trace_next_page(struct trace_array_cpu *data, void *addr) -{ - struct list_head *next; - struct page *page; - - page = virt_to_page(addr); - - next = trace_next_list(data, &page->lru); - page = list_entry(next, struct page, lru); - - return page_address(page); -} - -struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) -{ - unsigned long idx, idx_next; - struct trace_entry *entry; - - data->trace_idx++; - idx = data->trace_head_idx; - idx_next = idx + 1; - - BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); - - entry = data->trace_head + idx * TRACE_ENTRY_SIZE; - - if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { - data->trace_head = trace_next_page(data, data->trace_head); - idx_next = 0; - } - - if (data->trace_head == data->trace_tail && - idx_next == data->trace_tail_idx) { - /* overrun */ - data->overrun++; - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = - trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } - } - - data->trace_head_idx = idx_next; - - return entry; -} - void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { @@ -796,7 +639,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) entry->field.preempt_count = pc & 0xff; entry->field.pid = (tsk) ? tsk->pid : 0; - entry->field.t = ftrace_now(raw_smp_processor_id()); entry->field.flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | @@ -808,18 +650,20 @@ void trace_function(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { + struct ring_buffer_event *event; struct trace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, flags); entry->type = TRACE_FN; entry->field.fn.ip = ip; entry->field.fn.parent_ip = parent_ip; - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } void @@ -835,13 +679,19 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip) { + struct ring_buffer_event *event; struct trace_entry *entry; struct stack_trace trace; + unsigned long irq_flags; if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, flags); entry->type = TRACE_STACK; @@ -853,28 +703,31 @@ void __trace_stack(struct trace_array *tr, trace.entries = entry->field.stack.caller; save_stack_trace(&trace); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } void __trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3) { + struct ring_buffer_event *event; struct trace_array_cpu *data = __data; struct trace_array *tr = __tr; struct trace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, 0); entry->type = TRACE_SPECIAL; entry->field.special.arg1 = arg1; entry->field.special.arg2 = arg2; entry->field.special.arg3 = arg3; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); __trace_stack(tr, data, irq_flags, 4); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -886,12 +739,15 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags) { + struct ring_buffer_event *event; struct trace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; entry->field.ctx.prev_pid = prev->pid; @@ -901,9 +757,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->field.ctx.next_prio = next->prio; entry->field.ctx.next_state = next->state; entry->field.ctx.next_cpu = task_cpu(next); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); __trace_stack(tr, data, flags, 5); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); } void @@ -913,12 +768,15 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags) { + struct ring_buffer_event *event; struct trace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, flags); entry->type = TRACE_WAKE; entry->field.ctx.prev_pid = curr->pid; @@ -928,9 +786,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->field.ctx.next_prio = wakee->prio; entry->field.ctx.next_state = wakee->state; entry->field.ctx.next_cpu = task_cpu(wakee); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); __trace_stack(tr, data, flags, 6); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -1011,183 +868,77 @@ enum trace_file_type { TRACE_FILE_LAT_FMT = 1, }; -/* Return the current entry. */ -static struct trace_entry * -trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, - struct trace_iterator *iter, int cpu) -{ - struct page *page; - struct trace_entry *array; - - if (iter->next_idx[cpu] >= tr->entries || - iter->next_idx[cpu] >= data->trace_idx || - (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx)) - return NULL; - - if (!iter->next_page[cpu]) { - /* Initialize the iterator for this cpu trace buffer */ - WARN_ON(!data->trace_tail); - page = virt_to_page(data->trace_tail); - iter->next_page[cpu] = &page->lru; - iter->next_page_idx[cpu] = data->trace_tail_idx; - } - - page = list_entry(iter->next_page[cpu], struct page, lru); - BUG_ON(&data->trace_pages == &page->lru); - - array = page_address(page); - - WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); - return &array[iter->next_page_idx[cpu]]; -} - -/* Increment the index counter of an iterator by one */ -static void __trace_iterator_increment(struct trace_iterator *iter, int cpu) -{ - iter->next_idx[cpu]++; - iter->next_page_idx[cpu]++; - - if (iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE) { - struct trace_array_cpu *data = iter->tr->data[cpu]; - - iter->next_page_idx[cpu] = 0; - iter->next_page[cpu] = - trace_next_list(data, iter->next_page[cpu]); - } -} - static void trace_iterator_increment(struct trace_iterator *iter, int cpu) { iter->idx++; - __trace_iterator_increment(iter, cpu); + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); } static struct trace_entry * -trace_entry_next(struct trace_array *tr, struct trace_array_cpu *data, - struct trace_iterator *iter, int cpu) +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) { - struct list_head *next_page; - struct trace_entry *ent; - int idx, next_idx, next_page_idx; - - ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); - - if (likely(!ent || ent->type != TRACE_CONT)) - return ent; - - /* save the iterator details */ - idx = iter->idx; - next_idx = iter->next_idx[cpu]; - next_page_idx = iter->next_page_idx[cpu]; - next_page = iter->next_page[cpu]; - - /* find a real entry */ - do { - __trace_iterator_increment(iter, cpu); - ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); - } while (ent && ent->type != TRACE_CONT); - - /* reset the iterator */ - iter->idx = idx; - iter->next_idx[cpu] = next_idx; - iter->next_page_idx[cpu] = next_page_idx; - iter->next_page[cpu] = next_page; + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - return ent; + event = ring_buffer_iter_peek(buf_iter, ts); + return event ? ring_buffer_event_data(event) : NULL; } - static struct trace_entry * -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, int inc) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - struct trace_array *tr = iter->tr; + struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + u64 next_ts = 0, ts; int next_cpu = -1; int cpu; for_each_tracing_cpu(cpu) { - if (!head_page(tr->data[cpu])) - continue; - ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + if (ring_buffer_empty_cpu(buffer, cpu)) + continue; - if (ent && ent->type == TRACE_CONT) { - struct trace_array_cpu *data = tr->data[cpu]; - - if (!inc) - ent = trace_entry_next(tr, data, iter, cpu); - else { - while (ent && ent->type == TRACE_CONT) { - __trace_iterator_increment(iter, cpu); - ent = trace_entry_idx(tr, tr->data[cpu], - iter, cpu); - } - } - } + ent = peek_next_entry(iter, cpu, &ts); /* * Pick the entry with the smallest timestamp: */ - if (ent && (!next || ent->field.t < next->field.t)) { + if (ent && (!next || ts < next_ts)) { next = ent; next_cpu = cpu; + next_ts = ts; } } if (ent_cpu) *ent_cpu = next_cpu; + if (ent_ts) + *ent_ts = next_ts; + return next; } /* Find the next real entry, without updating the iterator itself */ static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu) +find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, 0); + return __find_next_entry(iter, ent_cpu, ent_ts); } /* Find the next real entry, and increment the iterator to the next entry */ static void *find_next_entry_inc(struct trace_iterator *iter) { - struct trace_entry *next; - int next_cpu = -1; - - next = __find_next_entry(iter, &next_cpu, 1); - - iter->prev_ent = iter->ent; - iter->prev_cpu = iter->cpu; + iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); - iter->ent = next; - iter->cpu = next_cpu; - - if (next) + if (iter->ent) trace_iterator_increment(iter, iter->cpu); - return next ? iter : NULL; + return iter->ent ? iter : NULL; } static void trace_consume(struct trace_iterator *iter) { - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - struct trace_entry *ent; - - again: - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } - - /* Check if we empty it, then reset the index */ - if (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx) - data->trace_idx = 0; - - ent = trace_entry_idx(iter->tr, iter->tr->data[iter->cpu], - iter, iter->cpu); - if (ent && ent->type == TRACE_CONT) - goto again; + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1220,7 +971,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) struct trace_iterator *iter = m->private; void *p = NULL; loff_t l = 0; - int i; + int cpu; mutex_lock(&trace_types_lock); @@ -1239,12 +990,9 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->ent = NULL; iter->cpu = 0; iter->idx = -1; - iter->prev_ent = NULL; - iter->prev_cpu = -1; - for_each_tracing_cpu(i) { - iter->next_idx[i] = 0; - iter->next_page[i] = NULL; + for_each_tracing_cpu(cpu) { + ring_buffer_iter_reset(iter->buffer_iter[cpu]); } for (p = iter; p && l < *pos; p = s_next(m, p, &l)) @@ -1365,23 +1113,16 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long total = 0; - unsigned long entries = 0; - int cpu; + unsigned long total; + unsigned long entries; const char *name = "preemption"; if (type) name = type->name; - for_each_tracing_cpu(cpu) { - if (head_page(tr->data[cpu])) { - total += tr->data[cpu]->trace_idx; - if (tr->data[cpu]->trace_idx > tr->entries) - entries += tr->entries; - else - entries += tr->data[cpu]->trace_idx; - } - } + entries = ring_buffer_entries(iter->tr->buffer); + total = entries + + ring_buffer_overruns(iter->tr->buffer); seq_printf(m, "%s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -1468,7 +1209,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) unsigned long preempt_mark_thresh = 100; static void -lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, unsigned long rel_usecs) { trace_seq_printf(s, " %4lldus", abs_usecs); @@ -1488,12 +1229,10 @@ static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; */ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) { - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = tr->data[iter->cpu]; struct trace_entry *ent; bool ok = true; - ent = trace_entry_idx(tr, data, iter, iter->cpu); + ent = peek_next_entry(iter, iter->cpu, NULL); if (!ent || ent->type != TRACE_CONT) { trace_seq_putc(s, '\n'); return; @@ -1502,8 +1241,8 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) do { if (ok) ok = (trace_seq_printf(s, "%s", ent->cont.buf) > 0); - __trace_iterator_increment(iter, iter->cpu); - ent = trace_entry_idx(tr, data, iter, iter->cpu); + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + ent = peek_next_entry(iter, iter->cpu, NULL); } while (ent && ent->type == TRACE_CONT); if (!ok) @@ -1515,25 +1254,26 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *next_entry = find_next_entry(iter, NULL); + struct trace_entry *next_entry; unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; struct trace_field *field = &entry->field; unsigned long abs_usecs; unsigned long rel_usecs; + u64 next_ts; char *comm; int S, T; int i; unsigned state; - if (!next_entry) - next_entry = entry; - if (entry->type == TRACE_CONT) return 1; - rel_usecs = ns2usecs(next_entry->field.t - entry->field.t); - abs_usecs = ns2usecs(entry->field.t - iter->tr->time_start); + next_entry = find_next_entry(iter, NULL, &next_ts); + if (!next_entry) + next_ts = iter->ts; + rel_usecs = ns2usecs(next_ts - iter->ts); + abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); if (verbose) { comm = trace_find_cmdline(field->pid); @@ -1542,7 +1282,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm, field->pid, cpu, field->flags, field->preempt_count, trace_idx, - ns2usecs(field->t), + ns2usecs(iter->ts), abs_usecs/1000, abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); @@ -1627,7 +1367,7 @@ static int print_trace_fmt(struct trace_iterator *iter) comm = trace_find_cmdline(iter->ent->field.pid); - t = ns2usecs(field->t); + t = ns2usecs(iter->ts); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; @@ -1732,7 +1472,7 @@ static int print_raw_fmt(struct trace_iterator *iter) field = &entry->field; ret = trace_seq_printf(s, "%d %d %llu ", - field->pid, iter->cpu, field->t); + field->pid, iter->cpu, iter->ts); if (!ret) return 0; @@ -1811,7 +1551,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, field->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->t); + SEQ_PUT_HEX_FIELD_RET(s, iter->ts); switch (entry->type) { case TRACE_FN: @@ -1861,7 +1601,7 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, field->pid); SEQ_PUT_FIELD_RET(s, field->cpu); - SEQ_PUT_FIELD_RET(s, field->t); + SEQ_PUT_FIELD_RET(s, iter->ts); switch (entry->type) { case TRACE_FN: @@ -1888,15 +1628,10 @@ static int print_bin_fmt(struct trace_iterator *iter) static int trace_empty(struct trace_iterator *iter) { - struct trace_array_cpu *data; int cpu; for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (head_page(data) && data->trace_idx && - (data->trace_tail != data->trace_head || - data->trace_tail_idx != data->trace_head_idx)) + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) return 0; } return 1; @@ -1961,6 +1696,8 @@ static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, int *ret) { struct trace_iterator *iter; + struct seq_file *m; + int cpu; if (tracing_disabled) { *ret = -ENODEV; @@ -1981,28 +1718,43 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = + ring_buffer_read_start(iter->tr->buffer, cpu); + if (!iter->buffer_iter[cpu]) + goto fail_buffer; + } + /* TODO stop tracer */ *ret = seq_open(file, &tracer_seq_ops); - if (!*ret) { - struct seq_file *m = file->private_data; - m->private = iter; + if (*ret) + goto fail_buffer; - /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } + m = file->private_data; + m->private = iter; - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - } else { - kfree(iter); - iter = NULL; + /* stop the trace while dumping */ + if (iter->tr->ctrl) { + tracer_enabled = 0; + ftrace_function_enabled = 0; } + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + mutex_unlock(&trace_types_lock); out: return iter; + + fail_buffer: + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + mutex_unlock(&trace_types_lock); + + return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -2018,8 +1770,14 @@ int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct trace_iterator *iter = m->private; + int cpu; mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + if (iter->trace && iter->trace->close) iter->trace->close(iter); @@ -2526,6 +2284,7 @@ static atomic_t tracing_reader; static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_iterator *iter; + int cpu; if (tracing_disabled) return -ENODEV; @@ -2546,17 +2305,38 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->trace = current_trace; filp->private_data = iter; + for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = + ring_buffer_read_start(iter->tr->buffer, cpu); + if (!iter->buffer_iter[cpu]) + goto fail_buffer; + } + if (iter->trace->pipe_open) iter->trace->pipe_open(iter); mutex_unlock(&trace_types_lock); return 0; + + fail_buffer: + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + mutex_unlock(&trace_types_lock); + + return -ENOMEM; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + int cpu; + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } kfree(iter); atomic_dec(&tracing_reader); @@ -2592,13 +2372,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - struct trace_array_cpu *data; - static cpumask_t mask; unsigned long flags; #ifdef CONFIG_FTRACE int ftrace_save; #endif - int cpu; ssize_t sret; /* return any leftover data */ @@ -2687,32 +2464,13 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, * and then release the locks again. */ - cpus_clear(mask); - local_irq_save(flags); + local_irq_disable(); #ifdef CONFIG_FTRACE ftrace_save = ftrace_enabled; ftrace_enabled = 0; #endif smp_wmb(); - for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (!head_page(data) || !data->trace_idx) - continue; - - atomic_inc(&data->disabled); - cpu_set(cpu, mask); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_lock(&data->lock); - - if (data->overrun > iter->last_overrun[cpu]) - iter->overrun[cpu] += - data->overrun - iter->last_overrun[cpu]; - iter->last_overrun[cpu] = data->overrun; - } + ring_buffer_lock(iter->tr->buffer, &flags); while (find_next_entry_inc(iter) != NULL) { int ret; @@ -2731,19 +2489,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, break; } - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_unlock(&data->lock); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - atomic_dec(&data->disabled); - } + ring_buffer_unlock(iter->tr->buffer, flags); #ifdef CONFIG_FTRACE ftrace_enabled = ftrace_save; #endif - local_irq_restore(flags); + local_irq_enable(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -2776,7 +2526,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int i, ret; + int ret; struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) @@ -2804,52 +2554,31 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, goto out; } - if (val > global_trace.entries) { - long pages_requested; - unsigned long freeable_pages; - - /* make sure we have enough memory before mapping */ - pages_requested = - (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; - - /* account for each buffer (and max_tr) */ - pages_requested *= tracing_nr_buffers * 2; - - /* Check for overflow */ - if (pages_requested < 0) { - cnt = -ENOMEM; - goto out; - } - - freeable_pages = determine_dirtyable_memory(); - - /* we only allow to request 1/4 of useable memory */ - if (pages_requested > - ((freeable_pages + tracing_pages_allocated) / 4)) { - cnt = -ENOMEM; + if (val != global_trace.entries) { + ret = ring_buffer_resize(global_trace.buffer, val); + if (ret < 0) { + cnt = ret; goto out; } - while (global_trace.entries < val) { - if (trace_alloc_page()) { - cnt = -ENOMEM; - goto out; + ret = ring_buffer_resize(max_tr.buffer, val); + if (ret < 0) { + int r; + cnt = ret; + r = ring_buffer_resize(global_trace.buffer, + global_trace.entries); + if (r < 0) { + /* AARGH! We are left with different + * size max buffer!!!! */ + WARN_ON(1); + tracing_disabled = 1; } - /* double check that we don't go over the known pages */ - if (tracing_pages_allocated > pages_requested) - break; + goto out; } - } else { - /* include the number of entries in val (inc of page entries) */ - while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) - trace_free_page(); + global_trace.entries = val; } - /* check integrity */ - for_each_tracing_cpu(i) - check_pages(global_trace.data[i]); - filp->f_pos += cnt; /* If check pages failed, return ENOMEM */ @@ -3086,10 +2815,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) static DEFINE_SPINLOCK(trace_buf_lock); static char trace_buf[TRACE_BUF_SIZE]; + struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; struct trace_entry *entry; - unsigned long flags; + unsigned long flags, irq_flags; long disabled; int cpu, len = 0, write, written = 0; @@ -3110,8 +2840,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) len = min(len, TRACE_BUF_SIZE-1); trace_buf[len] = 0; - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, flags); entry->type = TRACE_PRINT; entry->field.print.ip = ip; @@ -3121,21 +2854,27 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->field.print.buf, trace_buf, write); entry->field.print.buf[write] = 0; written = write; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); if (written != len) entry->field.flags |= TRACE_FLAG_CONT; while (written != len) { - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); entry->type = TRACE_CONT; write = min(len - written, (int)(TRACE_CONT_BUF_SIZE-1)); memcpy(&entry->cont.buf, trace_buf+written, write); entry->cont.buf[write] = 0; written += write; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } - __raw_spin_unlock(&data->lock); + out_unlock: spin_unlock(&trace_buf_lock); out: @@ -3227,12 +2966,10 @@ void ftrace_dump(void) static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - struct trace_array_cpu *data; static cpumask_t mask; static int dump_ran; - unsigned long flags; + unsigned long flags, irq_flags; int cnt = 0; - int cpu; /* only one dump */ spin_lock_irqsave(&ftrace_dump_lock, flags); @@ -3258,25 +2995,7 @@ void ftrace_dump(void) cpus_clear(mask); - for_each_tracing_cpu(cpu) { - data = iter.tr->data[cpu]; - - if (!head_page(data) || !data->trace_idx) - continue; - - atomic_inc(&data->disabled); - cpu_set(cpu, mask); - } - - for_each_cpu_mask(cpu, mask) { - data = iter.tr->data[cpu]; - __raw_spin_lock(&data->lock); - - if (data->overrun > iter.last_overrun[cpu]) - iter.overrun[cpu] += - data->overrun - iter.last_overrun[cpu]; - iter.last_overrun[cpu] = data->overrun; - } + ring_buffer_lock(iter.tr->buffer, &irq_flags); while (!trace_empty(&iter)) { @@ -3305,205 +3024,47 @@ void ftrace_dump(void) else printk(KERN_TRACE "---------------------------------\n"); - for_each_cpu_mask(cpu, mask) { - data = iter.tr->data[cpu]; - __raw_spin_unlock(&data->lock); - } - - for_each_cpu_mask(cpu, mask) { - data = iter.tr->data[cpu]; - atomic_dec(&data->disabled); - } - + ring_buffer_unlock(iter.tr->buffer, irq_flags); out: spin_unlock_irqrestore(&ftrace_dump_lock, flags); } -static int trace_alloc_page(void) +__init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; - struct page *page, *tmp; - LIST_HEAD(pages); - void *array; - unsigned pages_allocated = 0; int i; - /* first allocate a page for each CPU */ - for_each_tracing_cpu(i) { - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } - - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); + /* TODO: make the number of buffers hot pluggable with CPUS */ + tracing_buffer_mask = cpu_possible_map; -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); -#endif + global_trace.buffer = ring_buffer_alloc(trace_buf_size, + TRACE_BUFFER_FLAGS); + if (!global_trace.buffer) { + printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); + WARN_ON(1); + return 0; } - - /* Now that we successfully allocate a page per CPU, add them */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - ClearPageLRU(page); + global_trace.entries = ring_buffer_size(global_trace.buffer); #ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - SetPageLRU(page); -#endif - } - tracing_pages_allocated += pages_allocated; - global_trace.entries += ENTRIES_PER_PAGE; - - return 0; - - free_pages: - list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del_init(&page->lru); - __free_page(page); + max_tr.buffer = ring_buffer_alloc(trace_buf_size, + TRACE_BUFFER_FLAGS); + if (!max_tr.buffer) { + printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); + WARN_ON(1); + ring_buffer_free(global_trace.buffer); + return 0; } - return -ENOMEM; -} - -static int trace_free_page(void) -{ - struct trace_array_cpu *data; - struct page *page; - struct list_head *p; - int i; - int ret = 0; - - /* free one page from each buffer */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - tracing_pages_allocated--; - tracing_pages_allocated--; - __free_page(page); - - tracing_reset(data); - -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - __free_page(page); - - tracing_reset(data); + max_tr.entries = ring_buffer_size(max_tr.buffer); + WARN_ON(max_tr.entries != global_trace.entries); #endif - } - global_trace.entries -= ENTRIES_PER_PAGE; - - return ret; -} - -__init static int tracer_alloc_buffers(void) -{ - struct trace_array_cpu *data; - void *array; - struct page *page; - int pages = 0; - int ret = -ENOMEM; - int i; - - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_nr_buffers = num_possible_cpus(); - tracing_buffer_mask = cpu_possible_map; /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); - - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } - - /* set the array to the list */ - INIT_LIST_HEAD(&data->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &data->trace_pages); - /* use the LRU flag to differentiate the two buffers */ - ClearPageLRU(page); - - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } - - INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &max_tr.data[i]->trace_pages); - SetPageLRU(page); -#endif - } - - /* - * Since we allocate by orders of pages, we may be able to - * round up a bit. - */ - global_trace.entries = ENTRIES_PER_PAGE; - pages++; - - while (global_trace.entries < trace_nr_entries) { - if (trace_alloc_page()) - break; - pages++; } - max_tr.entries = global_trace.entries; - - pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", - pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); - pr_info(" actual entries %ld\n", global_trace.entries); trace_init_cmdlines(); @@ -3519,38 +3080,13 @@ __init static int tracer_alloc_buffers(void) /* All seems OK, enable tracing */ global_trace.ctrl = tracer_enabled; tracing_disabled = 0; + atomic_notifier_chain_register(&panic_notifier_list, &trace_panic_notifier); register_die_notifier(&trace_die_notifier); return 0; - - free_buffers: - for (i-- ; i >= 0; i--) { - struct page *page, *tmp; - struct trace_array_cpu *data = global_trace.data[i]; - - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } - -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } -#endif - } - return ret; } early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b28bf8812efc..f6965f775b43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -102,7 +103,6 @@ struct trace_field { char flags; char preempt_count; int pid; - cycle_t t; union { struct ftrace_entry fn; struct ctx_switch_entry ctx; @@ -139,16 +139,9 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - struct list_head trace_pages; atomic_t disabled; - raw_spinlock_t lock; - struct lock_class_key lock_key; /* these fields get copied into max-trace: */ - unsigned trace_head_idx; - unsigned trace_tail_idx; - void *trace_head; /* producer */ - void *trace_tail; /* consumer */ unsigned long trace_idx; unsigned long overrun; unsigned long saved_latency; @@ -172,6 +165,7 @@ struct trace_iterator; * They have on/off state as well: */ struct trace_array { + struct ring_buffer *buffer; unsigned long entries; long ctrl; int cpu; @@ -219,27 +213,21 @@ struct trace_iterator { struct trace_array *tr; struct tracer *trace; void *private; - long last_overrun[NR_CPUS]; - long overrun[NR_CPUS]; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; int cpu; - - struct trace_entry *prev_ent; - int prev_cpu; + u64 ts; unsigned long iter_flags; loff_t pos; - unsigned long next_idx[NR_CPUS]; - struct list_head *next_page[NR_CPUS]; - unsigned next_page_idx[NR_CPUS]; long idx; }; void trace_wake_up(void); -void tracing_reset(struct trace_array_cpu *data); +void tracing_reset(struct trace_array *tr, int cpu); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d5c9e2e4a9c4..3657eec6b87d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -34,7 +34,7 @@ static void boot_trace_init(struct trace_array *tr) trace_boot_enabled = 0; for_each_cpu_mask(cpu, cpu_possible_map) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void boot_trace_ctrl_update(struct trace_array *tr) @@ -74,6 +74,7 @@ struct tracer boot_tracer __read_mostly = void trace_boot(struct boot_trace *it) { + struct ring_buffer_event *event; struct trace_entry *entry; struct trace_array_cpu *data; unsigned long irq_flags; @@ -85,17 +86,18 @@ void trace_boot(struct boot_trace *it) preempt_disable(); data = tr->data[smp_processor_id()]; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, 0); entry->type = TRACE_BOOT; entry->field.initcall = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); trace_wake_up(); + out: preempt_enable(); } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 312144897970..e90eb0c2c56c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,7 +23,7 @@ static void function_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void start_function_trace(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ece6cfb649fa..37ad49407f27 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -173,7 +173,7 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - tracing_reset(data); + tracing_reset(tr, cpu); trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); } @@ -203,7 +203,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? : ip; - tracing_reset(data); + tracing_reset(tr, cpu); local_save_flags(flags); @@ -234,7 +234,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!head_page(data)) || + if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) return; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index a108c326f36e..bdbf09d8413c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -27,7 +27,7 @@ static void mmio_reset_data(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void mmio_trace_init(struct trace_array *tr) @@ -130,10 +130,14 @@ static unsigned long count_overruns(struct trace_iterator *iter) { int cpu; unsigned long cnt = 0; +/* FIXME: */ +#if 0 for_each_online_cpu(cpu) { cnt += iter->overrun[cpu]; iter->overrun[cpu] = 0; } +#endif + (void)cpu; return cnt; } @@ -176,7 +180,7 @@ static int mmio_print_rw(struct trace_iterator *iter) struct trace_entry *entry = iter->ent; struct mmiotrace_rw *rw = &entry->field.mmiorw; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->field.t); + unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; @@ -218,7 +222,7 @@ static int mmio_print_map(struct trace_iterator *iter) struct trace_entry *entry = iter->ent; struct mmiotrace_map *m = &entry->field.mmiomap; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->field.t); + unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; @@ -250,7 +254,7 @@ static int mmio_print_mark(struct trace_iterator *iter) struct trace_entry *entry = iter->ent; const char *msg = entry->field.print.buf; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->field.t); + unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret; @@ -303,19 +307,19 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ring_buffer_event *event; struct trace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, 0); entry->type = TRACE_MMIO_RW; entry->field.mmiorw = *rw; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); } @@ -331,19 +335,19 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ring_buffer_event *event; struct trace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); tracing_generic_entry_update(entry, 0); entry->type = TRACE_MMIO_MAP; entry->field.mmiomap = *map; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 16c9ba060bab..4592b4862515 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -30,7 +30,7 @@ static void nop_trace_init(struct trace_array *tr) ctx_trace = tr; for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); if (tr->ctrl) start_nop_trace(tr); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 789e927abc9c..e0b06db0f7af 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -81,7 +81,7 @@ static void sched_switch_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static int tracing_sched_register(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 08206b4e29c4..01e75e0639b7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -191,7 +191,7 @@ static void __wakeup_reset(struct trace_array *tr) for_each_possible_cpu(cpu) { data = tr->data[cpu]; - tracing_reset(data); + tracing_reset(tr, cpu); } wakeup_cpu = -1; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ebd4b135498..09cf230d7eca 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -18,58 +18,20 @@ static inline int trace_valid_entry(struct trace_entry *entry) return 0; } -static int -trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) { - struct trace_entry *entries; - struct page *page; - int idx = 0; - int i; + struct ring_buffer_event *event; + struct trace_entry *entry; - BUG_ON(list_empty(&data->trace_pages)); - page = list_entry(data->trace_pages.next, struct page, lru); - entries = page_address(page); + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + entry = ring_buffer_event_data(event); - check_pages(data); - if (head_page(data) != entries) - goto failed; - - /* - * The starting trace buffer always has valid elements, - * if any element exists. - */ - entries = head_page(data); - - for (i = 0; i < tr->entries; i++) { - - if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + if (!trace_valid_entry(entry)) { printk(KERN_CONT ".. invalid entry %d ", - entries[idx].type); + entry->type); goto failed; } - - idx++; - if (idx >= ENTRIES_PER_PAGE) { - page = virt_to_page(entries); - if (page->lru.next == &data->trace_pages) { - if (i != tr->entries - 1) { - printk(KERN_CONT ".. entries buffer mismatch"); - goto failed; - } - } else { - page = list_entry(page->lru.next, struct page, lru); - entries = page_address(page); - } - idx = 0; - } - } - - page = virt_to_page(entries); - if (page->lru.next != &data->trace_pages) { - printk(KERN_CONT ".. too many entries"); - goto failed; } - return 0; failed: @@ -91,13 +53,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) /* Don't allow flipping of max traces now */ raw_local_irq_save(flags); __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(cpu) { - if (!head_page(tr->data[cpu])) - continue; - cnt += tr->data[cpu]->trace_idx; + cnt = ring_buffer_entries(tr->buffer); - ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + for_each_possible_cpu(cpu) { + ret = trace_test_buffer_cpu(tr, cpu); if (ret) break; } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index db58fb66a135..9587d3bcba55 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -241,7 +241,7 @@ static void stack_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void start_stack_trace(struct trace_array *tr) -- cgit v1.2.1 From 777e208d40d0953efc6fb4ab58590da3f7d8f02d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Sep 2008 23:02:42 -0400 Subject: ftrace: take advantage of variable length entries Now that the underlining ring buffer for ftrace now hold variable length entries, we can take advantage of this by only storing the size of the actual event into the buffer. This happens to increase the number of entries in the buffer dramatically. We can also get rid of the "trace_cont" operation, but I'm keeping that until we have no more users. Some of the ftrace tracers can now change their code to adapt to this new feature. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 439 ++++++++++++++++++++++------------------- kernel/trace/trace.h | 81 ++++---- kernel/trace/trace_boot.c | 13 +- kernel/trace/trace_mmiotrace.c | 31 +-- 4 files changed, 301 insertions(+), 263 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ef80793858b8..ed9e47c18810 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -637,9 +637,9 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) pc = preempt_count(); - entry->field.preempt_count = pc & 0xff; - entry->field.pid = (tsk) ? tsk->pid : 0; - entry->field.flags = + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | @@ -651,7 +651,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct ring_buffer_event *event; - struct trace_entry *entry; + struct ftrace_entry *entry; unsigned long irq_flags; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), @@ -659,10 +659,10 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->field.fn.ip = ip; - entry->field.fn.parent_ip = parent_ip; + tracing_generic_entry_update(&entry->ent, flags); + entry->ent.type = TRACE_FN; + entry->ip = ip; + entry->parent_ip = parent_ip; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } @@ -680,7 +680,7 @@ void __trace_stack(struct trace_array *tr, int skip) { struct ring_buffer_event *event; - struct trace_entry *entry; + struct stack_entry *entry; struct stack_trace trace; unsigned long irq_flags; @@ -692,15 +692,15 @@ void __trace_stack(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_STACK; + tracing_generic_entry_update(&entry->ent, flags); + entry->ent.type = TRACE_STACK; - memset(&entry->field.stack, 0, sizeof(entry->field.stack)); + memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; trace.max_entries = FTRACE_STACK_ENTRIES; trace.skip = skip; - trace.entries = entry->field.stack.caller; + trace.entries = entry->caller; save_stack_trace(&trace); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); @@ -713,7 +713,7 @@ __trace_special(void *__tr, void *__data, struct ring_buffer_event *event; struct trace_array_cpu *data = __data; struct trace_array *tr = __tr; - struct trace_entry *entry; + struct special_entry *entry; unsigned long irq_flags; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), @@ -721,11 +721,11 @@ __trace_special(void *__tr, void *__data, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_SPECIAL; - entry->field.special.arg1 = arg1; - entry->field.special.arg2 = arg2; - entry->field.special.arg3 = arg3; + tracing_generic_entry_update(&entry->ent, 0); + entry->ent.type = TRACE_SPECIAL; + entry->arg1 = arg1; + entry->arg2 = arg2; + entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); __trace_stack(tr, data, irq_flags, 4); @@ -740,7 +740,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags) { struct ring_buffer_event *event; - struct trace_entry *entry; + struct ctx_switch_entry *entry; unsigned long irq_flags; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), @@ -748,15 +748,15 @@ tracing_sched_switch_trace(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_CTX; - entry->field.ctx.prev_pid = prev->pid; - entry->field.ctx.prev_prio = prev->prio; - entry->field.ctx.prev_state = prev->state; - entry->field.ctx.next_pid = next->pid; - entry->field.ctx.next_prio = next->prio; - entry->field.ctx.next_state = next->state; - entry->field.ctx.next_cpu = task_cpu(next); + tracing_generic_entry_update(&entry->ent, flags); + entry->ent.type = TRACE_CTX; + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); __trace_stack(tr, data, flags, 5); } @@ -769,7 +769,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, unsigned long flags) { struct ring_buffer_event *event; - struct trace_entry *entry; + struct ctx_switch_entry *entry; unsigned long irq_flags; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), @@ -777,15 +777,15 @@ tracing_sched_wakeup_trace(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_WAKE; - entry->field.ctx.prev_pid = curr->pid; - entry->field.ctx.prev_prio = curr->prio; - entry->field.ctx.prev_state = curr->state; - entry->field.ctx.next_pid = wakee->pid; - entry->field.ctx.next_prio = wakee->prio; - entry->field.ctx.next_state = wakee->state; - entry->field.ctx.next_cpu = task_cpu(wakee); + tracing_generic_entry_update(&entry->ent, flags); + entry->ent.type = TRACE_WAKE; + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); __trace_stack(tr, data, flags, 6); @@ -1173,20 +1173,19 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) static void lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { - struct trace_field *field = &entry->field; int hardirq, softirq; char *comm; - comm = trace_find_cmdline(field->pid); + comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%8.8s-%-5d ", comm, field->pid); + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); trace_seq_printf(s, "%3d", cpu); trace_seq_printf(s, "%c%c", - (field->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', - ((field->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); - hardirq = field->flags & TRACE_FLAG_HARDIRQ; - softirq = field->flags & TRACE_FLAG_SOFTIRQ; + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; if (hardirq && softirq) { trace_seq_putc(s, 'H'); } else { @@ -1200,8 +1199,8 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) } } - if (field->preempt_count) - trace_seq_printf(s, "%x", field->preempt_count); + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); else trace_seq_puts(s, "."); } @@ -1230,6 +1229,7 @@ static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) { struct trace_entry *ent; + struct trace_field_cont *cont; bool ok = true; ent = peek_next_entry(iter, iter->cpu, NULL); @@ -1239,8 +1239,9 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) } do { + cont = (struct trace_field_cont *)ent; if (ok) - ok = (trace_seq_printf(s, "%s", ent->cont.buf) > 0); + ok = (trace_seq_printf(s, "%s", cont->buf) > 0); ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); ent = peek_next_entry(iter, iter->cpu, NULL); } while (ent && ent->type == TRACE_CONT); @@ -1257,7 +1258,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) struct trace_entry *next_entry; unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; - struct trace_field *field = &entry->field; unsigned long abs_usecs; unsigned long rel_usecs; u64 next_ts; @@ -1276,12 +1276,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); if (verbose) { - comm = trace_find_cmdline(field->pid); + comm = trace_find_cmdline(entry->pid); trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, - field->pid, cpu, field->flags, - field->preempt_count, trace_idx, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, ns2usecs(iter->ts), abs_usecs/1000, abs_usecs % 1000, rel_usecs/1000, @@ -1291,53 +1291,69 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) lat_print_timestamp(s, abs_usecs, rel_usecs); } switch (entry->type) { - case TRACE_FN: - seq_print_ip_sym(s, field->fn.ip, sym_flags); + case TRACE_FN: { + struct ftrace_entry *field = (struct ftrace_entry *)entry; + + seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_puts(s, " ("); - if (kretprobed(field->fn.parent_ip)) + if (kretprobed(field->parent_ip)) trace_seq_puts(s, KRETPROBE_MSG); else - seq_print_ip_sym(s, field->fn.parent_ip, sym_flags); + seq_print_ip_sym(s, field->parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; + } case TRACE_CTX: - case TRACE_WAKE: - T = field->ctx.next_state < sizeof(state_to_char) ? - state_to_char[field->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field = + (struct ctx_switch_entry *)entry; + + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; - state = field->ctx.prev_state ? - __ffs(field->ctx.prev_state) + 1 : 0; + state = field->prev_state ? + __ffs(field->prev_state) + 1 : 0; S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; - comm = trace_find_cmdline(field->ctx.next_pid); + comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->ctx.prev_pid, - field->ctx.prev_prio, + field->prev_pid, + field->prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - field->ctx.next_cpu, - field->ctx.next_pid, - field->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T, comm); break; - case TRACE_SPECIAL: + } + case TRACE_SPECIAL: { + struct special_entry *field = (struct special_entry *)entry; + trace_seq_printf(s, "# %ld %ld %ld\n", - field->special.arg1, - field->special.arg2, - field->special.arg3); + field->arg1, + field->arg2, + field->arg3); break; - case TRACE_STACK: + } + case TRACE_STACK: { + struct stack_entry *field = (struct stack_entry *)entry; + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, field->stack.caller[i], sym_flags); + seq_print_ip_sym(s, field->caller[i], sym_flags); } trace_seq_puts(s, "\n"); break; - case TRACE_PRINT: - seq_print_ip_sym(s, field->print.ip, sym_flags); - trace_seq_printf(s, ": %s", field->print.buf); - if (field->flags & TRACE_FLAG_CONT) + } + case TRACE_PRINT: { + struct print_entry *field = (struct print_entry *)entry; + + seq_print_ip_sym(s, field->ip, sym_flags); + trace_seq_printf(s, ": %s", field->buf); + if (entry->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1349,7 +1365,6 @@ static int print_trace_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; - struct trace_field *field; unsigned long usec_rem; unsigned long long t; unsigned long secs; @@ -1363,15 +1378,13 @@ static int print_trace_fmt(struct trace_iterator *iter) if (entry->type == TRACE_CONT) return 1; - field = &entry->field; - - comm = trace_find_cmdline(iter->ent->field.pid); + comm = trace_find_cmdline(iter->ent->pid); t = ns2usecs(iter->ts); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - ret = trace_seq_printf(s, "%16s-%-5d ", comm, field->pid); + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (!ret) return 0; ret = trace_seq_printf(s, "[%03d] ", iter->cpu); @@ -1382,20 +1395,22 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; switch (entry->type) { - case TRACE_FN: - ret = seq_print_ip_sym(s, field->fn.ip, sym_flags); + case TRACE_FN: { + struct ftrace_entry *field = (struct ftrace_entry *)entry; + + ret = seq_print_ip_sym(s, field->ip, sym_flags); if (!ret) return 0; if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - field->fn.parent_ip) { + field->parent_ip) { ret = trace_seq_printf(s, " <-"); if (!ret) return 0; - if (kretprobed(field->fn.parent_ip)) + if (kretprobed(field->parent_ip)) ret = trace_seq_puts(s, KRETPROBE_MSG); else ret = seq_print_ip_sym(s, - field->fn.parent_ip, + field->parent_ip, sym_flags); if (!ret) return 0; @@ -1404,40 +1419,50 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + } case TRACE_CTX: - case TRACE_WAKE: - S = field->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[field->ctx.prev_state] : 'X'; - T = field->ctx.next_state < sizeof(state_to_char) ? - state_to_char[field->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field = + (struct ctx_switch_entry *)entry; + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", - field->ctx.prev_pid, - field->ctx.prev_prio, + field->prev_pid, + field->prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - field->ctx.next_cpu, - field->ctx.next_pid, - field->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T); if (!ret) return 0; break; - case TRACE_SPECIAL: + } + case TRACE_SPECIAL: { + struct special_entry *field = (struct special_entry *)entry; + ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->special.arg1, - field->special.arg2, - field->special.arg3); + field->arg1, + field->arg2, + field->arg3); if (!ret) return 0; break; - case TRACE_STACK: + } + case TRACE_STACK: { + struct stack_entry *field = (struct stack_entry *)entry; + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { ret = trace_seq_puts(s, " <= "); if (!ret) return 0; } - ret = seq_print_ip_sym(s, field->stack.caller[i], + ret = seq_print_ip_sym(s, field->caller[i], sym_flags); if (!ret) return 0; @@ -1446,13 +1471,17 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; - case TRACE_PRINT: - seq_print_ip_sym(s, field->print.ip, sym_flags); - trace_seq_printf(s, ": %s", field->print.buf); - if (field->flags & TRACE_FLAG_CONT) + } + case TRACE_PRINT: { + struct print_entry *field = (struct print_entry *)entry; + + seq_print_ip_sym(s, field->ip, sym_flags); + trace_seq_printf(s, ": %s", field->buf); + if (entry->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); break; } + } return 1; } @@ -1460,7 +1489,6 @@ static int print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; - struct trace_field *field; int ret; int S, T; @@ -1469,56 +1497,66 @@ static int print_raw_fmt(struct trace_iterator *iter) if (entry->type == TRACE_CONT) return 1; - field = &entry->field; - ret = trace_seq_printf(s, "%d %d %llu ", - field->pid, iter->cpu, iter->ts); + entry->pid, iter->cpu, iter->ts); if (!ret) return 0; switch (entry->type) { - case TRACE_FN: + case TRACE_FN: { + struct ftrace_entry *field = (struct ftrace_entry *)entry; + ret = trace_seq_printf(s, "%x %x\n", - field->fn.ip, - field->fn.parent_ip); + field->ip, + field->parent_ip); if (!ret) return 0; break; + } case TRACE_CTX: - case TRACE_WAKE: - S = field->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[field->ctx.prev_state] : 'X'; - T = field->ctx.next_state < sizeof(state_to_char) ? - state_to_char[field->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field = + (struct ctx_switch_entry *)entry; + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", - field->ctx.prev_pid, - field->ctx.prev_prio, + field->prev_pid, + field->prev_prio, S, - field->ctx.next_cpu, - field->ctx.next_pid, - field->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T); if (!ret) return 0; break; + } case TRACE_SPECIAL: - case TRACE_STACK: + case TRACE_STACK: { + struct special_entry *field = (struct special_entry *)entry; + ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->special.arg1, - field->special.arg2, - field->special.arg3); + field->arg1, + field->arg2, + field->arg3); if (!ret) return 0; break; - case TRACE_PRINT: - trace_seq_printf(s, "# %lx %s", - field->print.ip, field->print.buf); - if (field->flags & TRACE_FLAG_CONT) + } + case TRACE_PRINT: { + struct print_entry *field = (struct print_entry *)entry; + + trace_seq_printf(s, "# %lx %s", field->ip, field->buf); + if (entry->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); break; } + } return 1; } @@ -1539,7 +1577,6 @@ static int print_hex_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; - struct trace_field *field; int S, T; entry = iter->ent; @@ -1547,40 +1584,48 @@ static int print_hex_fmt(struct trace_iterator *iter) if (entry->type == TRACE_CONT) return 1; - field = &entry->field; - - SEQ_PUT_HEX_FIELD_RET(s, field->pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); SEQ_PUT_HEX_FIELD_RET(s, iter->ts); switch (entry->type) { - case TRACE_FN: - SEQ_PUT_HEX_FIELD_RET(s, field->fn.ip); - SEQ_PUT_HEX_FIELD_RET(s, field->fn.parent_ip); + case TRACE_FN: { + struct ftrace_entry *field = (struct ftrace_entry *)entry; + + SEQ_PUT_HEX_FIELD_RET(s, field->ip); + SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); break; + } case TRACE_CTX: - case TRACE_WAKE: - S = field->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[field->ctx.prev_state] : 'X'; - T = field->ctx.next_state < sizeof(state_to_char) ? - state_to_char[field->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field = + (struct ctx_switch_entry *)entry; + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - SEQ_PUT_HEX_FIELD_RET(s, field->ctx.prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->ctx.next_prio); + SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); + SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); SEQ_PUT_HEX_FIELD_RET(s, T); break; + } case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_HEX_FIELD_RET(s, field->special.arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->special.arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->special.arg3); + case TRACE_STACK: { + struct special_entry *field = (struct special_entry *)entry; + + SEQ_PUT_HEX_FIELD_RET(s, field->arg1); + SEQ_PUT_HEX_FIELD_RET(s, field->arg2); + SEQ_PUT_HEX_FIELD_RET(s, field->arg3); break; } + } SEQ_PUT_FIELD_RET(s, newline); return 1; @@ -1590,39 +1635,46 @@ static int print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; - struct trace_field *field; entry = iter->ent; if (entry->type == TRACE_CONT) return 1; - field = &entry->field; - - SEQ_PUT_FIELD_RET(s, field->pid); - SEQ_PUT_FIELD_RET(s, field->cpu); + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, iter->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); switch (entry->type) { - case TRACE_FN: - SEQ_PUT_FIELD_RET(s, field->fn.ip); - SEQ_PUT_FIELD_RET(s, field->fn.parent_ip); + case TRACE_FN: { + struct ftrace_entry *field = (struct ftrace_entry *)entry; + + SEQ_PUT_FIELD_RET(s, field->ip); + SEQ_PUT_FIELD_RET(s, field->parent_ip); break; - case TRACE_CTX: - SEQ_PUT_FIELD_RET(s, field->ctx.prev_pid); - SEQ_PUT_FIELD_RET(s, field->ctx.prev_prio); - SEQ_PUT_FIELD_RET(s, field->ctx.prev_state); - SEQ_PUT_FIELD_RET(s, field->ctx.next_pid); - SEQ_PUT_FIELD_RET(s, field->ctx.next_prio); - SEQ_PUT_FIELD_RET(s, field->ctx.next_state); + } + case TRACE_CTX: { + struct ctx_switch_entry *field = + (struct ctx_switch_entry *)entry; + + SEQ_PUT_FIELD_RET(s, field->prev_pid); + SEQ_PUT_FIELD_RET(s, field->prev_prio); + SEQ_PUT_FIELD_RET(s, field->prev_state); + SEQ_PUT_FIELD_RET(s, field->next_pid); + SEQ_PUT_FIELD_RET(s, field->next_prio); + SEQ_PUT_FIELD_RET(s, field->next_state); break; + } case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_FIELD_RET(s, field->special.arg1); - SEQ_PUT_FIELD_RET(s, field->special.arg2); - SEQ_PUT_FIELD_RET(s, field->special.arg3); + case TRACE_STACK: { + struct special_entry *field = (struct special_entry *)entry; + + SEQ_PUT_FIELD_RET(s, field->arg1); + SEQ_PUT_FIELD_RET(s, field->arg2); + SEQ_PUT_FIELD_RET(s, field->arg3); break; } + } return 1; } @@ -2818,10 +2870,10 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct trace_entry *entry; + struct print_entry *entry; unsigned long flags, irq_flags; long disabled; - int cpu, len = 0, write, written = 0; + int cpu, len = 0, size; if (!tr->ctrl || tracing_disabled) return 0; @@ -2840,40 +2892,19 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) len = min(len, TRACE_BUF_SIZE-1); trace_buf[len] = 0; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + size = sizeof(*entry) + len + 1; + event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); if (!event) goto out_unlock; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_PRINT; - entry->field.print.ip = ip; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags); + entry->ent.type = TRACE_PRINT; + entry->ip = ip; - write = min(len, (int)(TRACE_PRINT_BUF_SIZE-1)); - - memcpy(&entry->field.print.buf, trace_buf, write); - entry->field.print.buf[write] = 0; - written = write; + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - if (written != len) - entry->field.flags |= TRACE_FLAG_CONT; - - while (written != len) { - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - - entry->type = TRACE_CONT; - write = min(len - written, (int)(TRACE_CONT_BUF_SIZE-1)); - memcpy(&entry->cont.buf, trace_buf+written, write); - entry->cont.buf[write] = 0; - written += write; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - } - out_unlock: spin_unlock(&trace_buf_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f6965f775b43..e541a6b7e312 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -26,10 +26,25 @@ enum trace_type { __TRACE_LAST_TYPE }; +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned char type; + unsigned char cpu; + unsigned char flags; + unsigned char preempt_count; + int pid; +}; + /* * Function trace entry - function address and parent function addres: */ struct ftrace_entry { + struct trace_entry ent; unsigned long ip; unsigned long parent_ip; }; @@ -39,6 +54,7 @@ extern struct tracer boot_tracer; * Context switch trace entry - which task (and prio) we switched from/to: */ struct ctx_switch_entry { + struct trace_entry ent; unsigned int prev_pid; unsigned char prev_prio; unsigned char prev_state; @@ -52,6 +68,7 @@ struct ctx_switch_entry { * Special (free-form) trace entry: */ struct special_entry { + struct trace_entry ent; unsigned long arg1; unsigned long arg2; unsigned long arg3; @@ -64,6 +81,7 @@ struct special_entry { #define FTRACE_STACK_ENTRIES 8 struct stack_entry { + struct trace_entry ent; unsigned long caller[FTRACE_STACK_ENTRIES]; }; @@ -71,10 +89,34 @@ struct stack_entry { * ftrace_printk entry: */ struct print_entry { + struct trace_entry ent; unsigned long ip; char buf[]; }; +#define TRACE_OLD_SIZE 88 + +struct trace_field_cont { + unsigned char type; + /* Temporary till we get rid of this completely */ + char buf[TRACE_OLD_SIZE - 1]; +}; + +struct trace_mmiotrace_rw { + struct trace_entry ent; + struct mmiotrace_rw rw; +}; + +struct trace_mmiotrace_map { + struct trace_entry ent; + struct mmiotrace_map map; +}; + +struct trace_boot { + struct trace_entry ent; + struct boot_trace initcall; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -92,46 +134,7 @@ enum trace_flag_type { TRACE_FLAG_CONT = 0x10, }; -/* - * The trace field - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_field { - char cpu; - char flags; - char preempt_count; - int pid; - union { - struct ftrace_entry fn; - struct ctx_switch_entry ctx; - struct special_entry special; - struct stack_entry stack; - struct print_entry print; - struct mmiotrace_rw mmiorw; - struct mmiotrace_map mmiomap; - struct boot_trace initcall; - }; -}; - -struct trace_field_cont { - char buf[sizeof(struct trace_field)]; -}; - -struct trace_entry { - char type; - union { - struct trace_field field; - struct trace_field_cont cont; - }; -}; - -#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) #define TRACE_BUF_SIZE 1024 -#define TRACE_PRINT_BUF_SIZE \ - (sizeof(struct trace_field) - offsetof(struct trace_field, print.buf)) -#define TRACE_CONT_BUF_SIZE sizeof(struct trace_field) /* * The CPU trace array - it consists of thousands of trace entries diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3657eec6b87d..fa8cca1be115 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -49,10 +49,11 @@ static int initcall_print_line(struct trace_iterator *iter) { int ret = 0; struct trace_entry *entry = iter->ent; - struct boot_trace *it = &entry->field.initcall; + struct trace_boot *field = (struct trace_boot *)entry; + struct boot_trace *it = &field->initcall; struct trace_seq *s = &iter->seq; - if (iter->ent->type == TRACE_BOOT) + if (entry->type == TRACE_BOOT) ret = trace_seq_printf(s, "%pF called from %i " "returned %d after %lld msecs\n", it->func, it->caller, it->result, @@ -75,7 +76,7 @@ struct tracer boot_tracer __read_mostly = void trace_boot(struct boot_trace *it) { struct ring_buffer_event *event; - struct trace_entry *entry; + struct trace_boot *entry; struct trace_array_cpu *data; unsigned long irq_flags; struct trace_array *tr = boot_trace; @@ -91,9 +92,9 @@ void trace_boot(struct boot_trace *it) if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_BOOT; - entry->field.initcall = *it; + tracing_generic_entry_update(&entry->ent, 0); + entry->ent.type = TRACE_BOOT; + entry->initcall = *it; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index bdbf09d8413c..3df441ea2749 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -178,14 +178,16 @@ print_out: static int mmio_print_rw(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_rw *rw = &entry->field.mmiorw; + struct trace_mmiotrace_rw *field = + (struct trace_mmiotrace_rw *)entry; + struct mmiotrace_rw *rw = &field->rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; - switch (entry->field.mmiorw.opcode) { + switch (rw->opcode) { case MMIO_READ: ret = trace_seq_printf(s, "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", @@ -220,14 +222,14 @@ static int mmio_print_rw(struct trace_iterator *iter) static int mmio_print_map(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_map *m = &entry->field.mmiomap; + struct mmiotrace_map *m = (struct mmiotrace_map *)entry; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; - switch (entry->field.mmiorw.opcode) { + switch (m->opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", @@ -252,7 +254,8 @@ static int mmio_print_map(struct trace_iterator *iter) static int mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - const char *msg = entry->field.print.buf; + struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); @@ -264,7 +267,7 @@ static int mmio_print_mark(struct trace_iterator *iter) if (!ret) return 0; - if (entry->field.flags & TRACE_FLAG_CONT) + if (entry->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); return 1; @@ -308,7 +311,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct ring_buffer_event *event; - struct trace_entry *entry; + struct trace_mmiotrace_rw *entry; unsigned long irq_flags; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), @@ -316,9 +319,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_RW; - entry->field.mmiorw = *rw; + tracing_generic_entry_update(&entry->ent, 0); + entry->ent.type = TRACE_MMIO_RW; + entry->rw = *rw; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); @@ -336,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct ring_buffer_event *event; - struct trace_entry *entry; + struct trace_mmiotrace_map *entry; unsigned long irq_flags; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), @@ -344,9 +347,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_MAP; - entry->field.mmiomap = *map; + tracing_generic_entry_update(&entry->ent, 0); + entry->ent.type = TRACE_MMIO_MAP; + entry->map = *map; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); -- cgit v1.2.1 From 2c4f035f6c3e8fda661eb6105aa51ef07aa71607 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Sep 2008 20:18:34 +0200 Subject: tracing/ftrace: change the type of the print_line callback We need a kind of disambiguation when a print_line callback returns 0. _There is not enough space to print all the entry. Please flush the seq and retry. _I can't handle this type of entry This patch changes the type of this callback for better information. Also some changes have been made in this V2. _ Only relay to default functions after the print_line callback fails. _ This patch doesn't fix the issue with the broken pipe (see patch 2/4 for that) Some things are still in discussion: _ Find better names for the enum print_line_t values _ Change the type of print_trace_line into boolean. Patches to change that can be sent later. Signed-off-by: Frederic Weisbecker Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 77 ++++++++++++++++++++++++++++------------------------ kernel/trace/trace.h | 10 ++++++- 2 files changed, 50 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed9e47c18810..b38a4bb40548 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1250,7 +1250,7 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) trace_seq_putc(s, '\n'); } -static int +static enum print_line_t print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; @@ -1267,7 +1267,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) unsigned state; if (entry->type == TRACE_CONT) - return 1; + return TRACE_TYPE_HANDLED; next_entry = find_next_entry(iter, NULL, &next_ts); if (!next_entry) @@ -1357,10 +1357,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } - return 1; + return TRACE_TYPE_HANDLED; } -static int print_trace_fmt(struct trace_iterator *iter) +static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1376,7 +1376,7 @@ static int print_trace_fmt(struct trace_iterator *iter) entry = iter->ent; if (entry->type == TRACE_CONT) - return 1; + return TRACE_TYPE_HANDLED; comm = trace_find_cmdline(iter->ent->pid); @@ -1386,13 +1386,13 @@ static int print_trace_fmt(struct trace_iterator *iter) ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, "[%03d] ", iter->cpu); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; switch (entry->type) { case TRACE_FN: { @@ -1400,12 +1400,12 @@ static int print_trace_fmt(struct trace_iterator *iter) ret = seq_print_ip_sym(s, field->ip, sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; if ((sym_flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { ret = trace_seq_printf(s, " <-"); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; if (kretprobed(field->parent_ip)) ret = trace_seq_puts(s, KRETPROBE_MSG); else @@ -1413,11 +1413,11 @@ static int print_trace_fmt(struct trace_iterator *iter) field->parent_ip, sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } ret = trace_seq_printf(s, "\n"); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } case TRACE_CTX: @@ -1439,7 +1439,7 @@ static int print_trace_fmt(struct trace_iterator *iter) field->next_prio, T); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } case TRACE_SPECIAL: { @@ -1450,7 +1450,7 @@ static int print_trace_fmt(struct trace_iterator *iter) field->arg2, field->arg3); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } case TRACE_STACK: { @@ -1460,16 +1460,16 @@ static int print_trace_fmt(struct trace_iterator *iter) if (i) { ret = trace_seq_puts(s, " <= "); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } ret = seq_print_ip_sym(s, field->caller[i], sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } ret = trace_seq_puts(s, "\n"); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } case TRACE_PRINT: { @@ -1482,10 +1482,10 @@ static int print_trace_fmt(struct trace_iterator *iter) break; } } - return 1; + return TRACE_TYPE_HANDLED; } -static int print_raw_fmt(struct trace_iterator *iter) +static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1495,12 +1495,12 @@ static int print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; if (entry->type == TRACE_CONT) - return 1; + return TRACE_TYPE_HANDLED; ret = trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; switch (entry->type) { case TRACE_FN: { @@ -1510,7 +1510,7 @@ static int print_raw_fmt(struct trace_iterator *iter) field->ip, field->parent_ip); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } case TRACE_CTX: @@ -1533,7 +1533,7 @@ static int print_raw_fmt(struct trace_iterator *iter) field->next_prio, T); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } case TRACE_SPECIAL: @@ -1545,7 +1545,7 @@ static int print_raw_fmt(struct trace_iterator *iter) field->arg2, field->arg3); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } case TRACE_PRINT: { @@ -1557,7 +1557,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; } } - return 1; + return TRACE_TYPE_HANDLED; } #define SEQ_PUT_FIELD_RET(s, x) \ @@ -1572,7 +1572,7 @@ do { \ return 0; \ } while (0) -static int print_hex_fmt(struct trace_iterator *iter) +static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; @@ -1582,7 +1582,7 @@ static int print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; if (entry->type == TRACE_CONT) - return 1; + return TRACE_TYPE_HANDLED; SEQ_PUT_HEX_FIELD_RET(s, entry->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); @@ -1628,10 +1628,10 @@ static int print_hex_fmt(struct trace_iterator *iter) } SEQ_PUT_FIELD_RET(s, newline); - return 1; + return TRACE_TYPE_HANDLED; } -static int print_bin_fmt(struct trace_iterator *iter) +static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1639,7 +1639,7 @@ static int print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; if (entry->type == TRACE_CONT) - return 1; + return TRACE_TYPE_HANDLED; SEQ_PUT_FIELD_RET(s, entry->pid); SEQ_PUT_FIELD_RET(s, iter->cpu); @@ -1686,13 +1686,18 @@ static int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) return 0; } - return 1; + return TRACE_TYPE_HANDLED; } -static int print_trace_line(struct trace_iterator *iter) +static enum print_line_t print_trace_line(struct trace_iterator *iter) { - if (iter->trace && iter->trace->print_line) - return iter->trace->print_line(iter); + enum print_line_t ret; + + if (iter->trace && iter->trace->print_line) { + ret = iter->trace->print_line(iter); + if (ret != TRACE_TYPE_UNHANDLED) + return ret; + } if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@ -2525,11 +2530,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, ring_buffer_lock(iter->tr->buffer, &flags); while (find_next_entry_inc(iter) != NULL) { - int ret; + enum print_line_t ret; int len = iter->seq.len; ret = print_trace_line(iter); - if (!ret) { + if (ret == TRACE_TYPE_PARTIAL_LINE) { /* don't print partial lines */ iter->seq.len = len; break; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e541a6b7e312..a921ba5d292d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -177,6 +177,14 @@ struct trace_array { struct trace_array_cpu *data[NR_CPUS]; }; + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ +}; + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -197,7 +205,7 @@ struct tracer { int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif - int (*print_line)(struct trace_iterator *iter); + enum print_line_t (*print_line)(struct trace_iterator *iter); struct tracer *next; int print_max; }; -- cgit v1.2.1 From 9ff4b9744c187cae58c3774361ea090addbc4130 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 29 Sep 2008 20:23:48 +0200 Subject: tracing/ftrace: fix pipe breaking This patch fixes a bug which break the pipe when the seq is empty. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b38a4bb40548..6a1c76bb56ba 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2439,7 +2439,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) return sret; - sret = 0; trace_seq_reset(&iter->seq); @@ -2450,6 +2449,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, goto out; } +waitagain: + sret = 0; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { @@ -2556,8 +2557,13 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= iter->seq.len) trace_seq_reset(&iter->seq); + + /* + * If there was nothing to send to user, inspite of consuming trace + * entries, go back to wait for more entries. + */ if (sret == -EBUSY) - sret = 0; + goto waitagain; out: mutex_unlock(&trace_types_lock); -- cgit v1.2.1 From 07f4e4f790895d55f46aaf89e7437da4a585989c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Sep 2008 20:27:42 +0200 Subject: tracing/ftrace: adapt mmiotrace to the new type of print_line Adapt mmiotrace to the new print_line type. By default, it ignores (and consumes) types it doesn't support. Signed-off-by: Frederic Weisbecker Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3df441ea2749..1a266aa08e1a 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -175,7 +175,7 @@ print_out: return (ret == -EBUSY) ? 0 : ret; } -static int mmio_print_rw(struct trace_iterator *iter) +static enum print_line_t mmio_print_rw(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct trace_mmiotrace_rw *field = @@ -215,11 +215,11 @@ static int mmio_print_rw(struct trace_iterator *iter) break; } if (ret) - return 1; - return 0; + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; } -static int mmio_print_map(struct trace_iterator *iter) +static enum print_line_t mmio_print_map(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct mmiotrace_map *m = (struct mmiotrace_map *)entry; @@ -227,7 +227,7 @@ static int mmio_print_map(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; - int ret = 1; + int ret; switch (m->opcode) { case MMIO_PROBE: @@ -247,11 +247,11 @@ static int mmio_print_map(struct trace_iterator *iter) break; } if (ret) - return 1; - return 0; + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; } -static int mmio_print_mark(struct trace_iterator *iter) +static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; @@ -265,16 +265,15 @@ static int mmio_print_mark(struct trace_iterator *iter) /* The trailing newline must be in the message. */ ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; if (entry->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); - return 1; + return TRACE_TYPE_HANDLED; } -/* return 0 to abort printing without consuming current entry in pipe mode */ -static int mmio_print_line(struct trace_iterator *iter) +static enum print_line_t mmio_print_line(struct trace_iterator *iter) { switch (iter->ent->type) { case TRACE_MMIO_RW: @@ -284,7 +283,7 @@ static int mmio_print_line(struct trace_iterator *iter) case TRACE_PRINT: return mmio_print_mark(iter); default: - return 1; /* ignore unknown entries */ + return TRACE_TYPE_HANDLED; /* ignore unknown entries */ } } -- cgit v1.2.1 From 9e9efffb7848fe53c334996819139b431b983ac2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Sep 2008 20:31:58 +0200 Subject: tracing/ftrace: adapt the boot tracer to the new print_line type This patch adapts the boot tracer to the new type of the print_line callback. It still relays entries it doesn't support to default output functions. Signed-off-by: Frederic Weisbecker Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/trace_boot.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index fa8cca1be115..43bde20b95bd 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -45,22 +45,25 @@ static void boot_trace_ctrl_update(struct trace_array *tr) stop_boot_trace(tr); } -static int initcall_print_line(struct trace_iterator *iter) +static enum print_line_t initcall_print_line(struct trace_iterator *iter) { - int ret = 0; + int ret; struct trace_entry *entry = iter->ent; struct trace_boot *field = (struct trace_boot *)entry; struct boot_trace *it = &field->initcall; struct trace_seq *s = &iter->seq; - if (entry->type == TRACE_BOOT) + if (entry->type == TRACE_BOOT) { ret = trace_seq_printf(s, "%pF called from %i " "returned %d after %lld msecs\n", it->func, it->caller, it->result, it->duration); - if (ret) - return 1; - return 0; + if (ret) + return TRACE_TYPE_HANDLED; + else + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_UNHANDLED; } struct tracer boot_tracer __read_mostly = -- cgit v1.2.1 From 70255b5e3f1bd1a5af5b1e425ec2c4db7c735112 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Oct 2008 00:29:52 -0400 Subject: ring_buffer: remove raw from local_irq_save The raw_local_irq_save causes issues with lockdep. We don't need it so replace them with local_irq_save. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cfa711374d9a..8e7392fd0db9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -847,7 +847,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, if (atomic_read(&buffer->record_disabled)) return NULL; - raw_local_irq_save(*flags); + local_irq_save(*flags); cpu = raw_smp_processor_id(); if (!cpu_isset(cpu, buffer->cpumask)) @@ -909,7 +909,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); spin_unlock(&cpu_buffer->lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); return 0; } @@ -1583,13 +1583,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - raw_local_irq_save(flags); + local_irq_save(flags); spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); spin_unlock(&cpu_buffer->lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); } /** -- cgit v1.2.1 From d769041f865330034131525ee6a7f72eb4af2a24 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Oct 2008 00:29:53 -0400 Subject: ring_buffer: implement new locking The old "lock always" scheme had issues with lockdep, and was not very efficient anyways. This patch does a new design to be partially lockless on writes. Writes will add new entries to the per cpu pages by simply disabling interrupts. When a write needs to go to another page than it will grab the lock. A new "read page" has been added so that the reader can pull out a page from the ring buffer to read without worrying about the writer writing over it. This allows us to not take the lock for all reads. The lock is now only taken when a read needs to go to a new page. This is far from lockless, and interrupts still need to be disabled, but it is a step towards a more lockless solution, and it also solves a lot of the issues that were noticed by the first conversion of ftrace to the ring buffers. Note: the ring_buffer_{un}lock API has been removed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 298 ++++++++++++++++++++++++++------------------- kernel/trace/trace.c | 113 +++++++++++------ 2 files changed, 247 insertions(+), 164 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8e7392fd0db9..9631abf2ae29 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -161,8 +161,10 @@ struct ring_buffer_per_cpu { struct list_head pages; unsigned long head; /* read from head */ unsigned long tail; /* write to tail */ + unsigned long reader; struct buffer_page *head_page; struct buffer_page *tail_page; + struct buffer_page *reader_page; unsigned long overrun; unsigned long entries; u64 write_stamp; @@ -260,6 +262,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long addr; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), @@ -272,9 +275,16 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) spin_lock_init(&cpu_buffer->lock); INIT_LIST_HEAD(&cpu_buffer->pages); + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto fail_free_buffer; + cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr); + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + cpu_buffer->reader_page->size = 0; + ret = rb_allocate_pages(cpu_buffer, buffer->pages); if (ret < 0) - goto fail_free_buffer; + goto fail_free_reader; cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); @@ -283,6 +293,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) return cpu_buffer; + fail_free_reader: + free_buffer_page(cpu_buffer->reader_page); + fail_free_buffer: kfree(cpu_buffer); return NULL; @@ -293,6 +306,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *page, *tmp; + list_del_init(&cpu_buffer->reader_page->list); + free_buffer_page(cpu_buffer->reader_page); + list_for_each_entry_safe(page, tmp, head, list) { list_del_init(&page->list); free_buffer_page(page); @@ -538,8 +554,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { - return cpu_buffer->head_page == cpu_buffer->tail_page && - cpu_buffer->head == cpu_buffer->tail; + return (cpu_buffer->reader == cpu_buffer->reader_page->size && + (cpu_buffer->tail_page == cpu_buffer->reader_page || + (cpu_buffer->tail_page == cpu_buffer->head_page && + cpu_buffer->head == cpu_buffer->tail))); } static inline int rb_null_event(struct ring_buffer_event *event) @@ -555,10 +573,10 @@ static inline void *rb_page_index(struct buffer_page *page, unsigned index) } static inline struct ring_buffer_event * -rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) +rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { - return rb_page_index(cpu_buffer->head_page, - cpu_buffer->head); + return rb_page_index(cpu_buffer->reader_page, + cpu_buffer->reader); } static inline struct ring_buffer_event * @@ -610,15 +628,32 @@ rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) cpu_buffer->write_stamp = *ts; } -static void rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer) +static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->read_stamp = cpu_buffer->head_page->time_stamp; cpu_buffer->head = 0; } -static void -rb_reset_iter_read_page(struct ring_buffer_iter *iter) +static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { + cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->reader = 0; +} + +static inline void rb_inc_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* + * The iterator could be on the reader page (it starts there). + * But the head could have moved, since the reader was + * found. Check for this case and assign the iterator + * to the head page instead of next. + */ + if (iter->head_page == cpu_buffer->reader_page) + iter->head_page = cpu_buffer->head_page; + else + rb_inc_page(cpu_buffer, &iter->head_page); + iter->read_stamp = iter->head_page->time_stamp; iter->head = 0; } @@ -693,30 +728,39 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *head_page, *tail_page; + struct buffer_page *tail_page, *head_page, *reader_page; unsigned long tail; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; + /* No locking needed for tail page */ tail_page = cpu_buffer->tail_page; - head_page = cpu_buffer->head_page; tail = cpu_buffer->tail; if (tail + length > BUF_PAGE_SIZE) { struct buffer_page *next_page = tail_page; + spin_lock(&cpu_buffer->lock); rb_inc_page(cpu_buffer, &next_page); + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; + + /* we grabbed the lock before incrementing */ + WARN_ON(next_page == reader_page); + if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) + if (!(buffer->flags & RB_FL_OVERWRITE)) { + spin_unlock(&cpu_buffer->lock); return NULL; + } /* count overflows */ rb_update_overflow(cpu_buffer); rb_inc_page(cpu_buffer, &head_page); cpu_buffer->head_page = head_page; - rb_reset_read_page(cpu_buffer); + rb_reset_head_page(cpu_buffer); } if (tail != BUF_PAGE_SIZE) { @@ -732,6 +776,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = tail_page; cpu_buffer->tail = tail; rb_add_stamp(cpu_buffer, ts); + spin_unlock(&cpu_buffer->lock); } BUG_ON(tail + length > BUF_PAGE_SIZE); @@ -802,7 +847,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } } else { + spin_lock(&cpu_buffer->lock); rb_add_stamp(cpu_buffer, &ts); + spin_unlock(&cpu_buffer->lock); delta = 0; } @@ -851,13 +898,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); if (!cpu_isset(cpu, buffer->cpumask)) - goto out_irq; + goto out; cpu_buffer = buffer->buffers[cpu]; - spin_lock(&cpu_buffer->lock); if (atomic_read(&cpu_buffer->record_disabled)) - goto no_record; + goto out; length = rb_calculate_event_length(length); if (length > BUF_PAGE_SIZE) @@ -865,13 +911,11 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); if (!event) - goto no_record; + goto out; return event; - no_record: - spin_unlock(&cpu_buffer->lock); - out_irq: + out: local_irq_restore(*flags); return NULL; } @@ -904,11 +948,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, cpu_buffer = buffer->buffers[cpu]; - assert_spin_locked(&cpu_buffer->lock); - rb_commit(cpu_buffer, event); - spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return 0; @@ -945,10 +986,9 @@ int ring_buffer_write(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); if (!cpu_isset(cpu, buffer->cpumask)) - goto out_irq; + goto out; cpu_buffer = buffer->buffers[cpu]; - spin_lock(&cpu_buffer->lock); if (atomic_read(&cpu_buffer->record_disabled)) goto out; @@ -967,55 +1007,11 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - spin_unlock(&cpu_buffer->lock); - out_irq: local_irq_restore(flags); return ret; } -/** - * ring_buffer_lock - lock the ring buffer - * @buffer: The ring buffer to lock - * @flags: The place to store the interrupt flags - * - * This locks all the per CPU buffers. - * - * Must be unlocked by ring_buffer_unlock. - */ -void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu; - - local_irq_save(*flags); - - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - spin_lock(&cpu_buffer->lock); - } -} - -/** - * ring_buffer_unlock - unlock a locked buffer - * @buffer: The locked buffer to unlock - * @flags: The interrupt flags received by ring_buffer_lock - */ -void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu; - - for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) { - if (!cpu_isset(cpu, buffer->cpumask)) - continue; - cpu_buffer = buffer->buffers[cpu]; - spin_unlock(&cpu_buffer->lock); - } - - local_irq_restore(flags); -} - /** * ring_buffer_record_disable - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. @@ -1169,9 +1165,18 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head; - rb_reset_iter_read_page(iter); + /* Iterator usage is expected to have record disabled */ + if (list_empty(&cpu_buffer->reader_page->list)) { + iter->head_page = cpu_buffer->head_page; + iter->head = cpu_buffer->head; + } else { + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader; + } + if (iter->head) + iter->read_stamp = cpu_buffer->read_stamp; + else + iter->read_stamp = iter->head_page->time_stamp; } /** @@ -1250,43 +1255,84 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; } -static void rb_advance_head(struct ring_buffer_per_cpu *cpu_buffer) +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { - struct ring_buffer_event *event; - unsigned length; + struct buffer_page *reader = NULL; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->lock, flags); + + again: + reader = cpu_buffer->reader_page; + + /* If there's more to read, return this page */ + if (cpu_buffer->reader < reader->size) + goto out; + + /* Never should we have an index greater than the size */ + WARN_ON(cpu_buffer->reader > reader->size); + + /* check if we caught up to the tail */ + reader = NULL; + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + goto out; /* - * Check if we are at the end of the buffer. + * Splice the empty reader page into the list around the head. + * Reset the reader page to size zero. */ - if (cpu_buffer->head >= cpu_buffer->head_page->size) { - BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page); - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); - rb_reset_read_page(cpu_buffer); - return; - } - event = rb_head_event(cpu_buffer); + reader = cpu_buffer->head_page; + cpu_buffer->reader_page->list.next = reader->list.next; + cpu_buffer->reader_page->list.prev = reader->list.prev; + cpu_buffer->reader_page->size = 0; - if (event->type == RINGBUF_TYPE_DATA) - cpu_buffer->entries--; - - length = rb_event_length(event); + /* Make the reader page now replace the head */ + reader->list.prev->next = &cpu_buffer->reader_page->list; + reader->list.next->prev = &cpu_buffer->reader_page->list; /* - * This should not be called to advance the header if we are - * at the tail of the buffer. + * If the tail is on the reader, then we must set the head + * to the inserted page, otherwise we set it one before. */ - BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) && - (cpu_buffer->head + length > cpu_buffer->tail)); + cpu_buffer->head_page = cpu_buffer->reader_page; - rb_update_read_stamp(cpu_buffer, event); + if (cpu_buffer->tail_page != reader) + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + + /* Finally update the reader page to the new head */ + cpu_buffer->reader_page = reader; + rb_reset_reader_page(cpu_buffer); + + goto again; + + out: + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + return reader; +} + +static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + unsigned length; + + reader = rb_get_reader_page(cpu_buffer); - cpu_buffer->head += length; + /* This function should not be called when buffer is empty */ + BUG_ON(!reader); - /* check for end of page */ - if ((cpu_buffer->head >= cpu_buffer->head_page->size) && - (cpu_buffer->head_page != cpu_buffer->tail_page)) - rb_advance_head(cpu_buffer); + event = rb_reader_event(cpu_buffer); + + if (event->type == RINGBUF_TYPE_DATA) + cpu_buffer->entries--; + + rb_update_read_stamp(cpu_buffer, event); + + length = rb_event_length(event); + cpu_buffer->reader += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) @@ -1304,8 +1350,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) */ if (iter->head >= iter->head_page->size) { BUG_ON(iter->head_page == cpu_buffer->tail_page); - rb_inc_page(cpu_buffer, &iter->head_page); - rb_reset_iter_read_page(iter); + rb_inc_iter(iter); return; } @@ -1344,6 +1389,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; + struct buffer_page *reader; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; @@ -1351,25 +1397,26 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) cpu_buffer = buffer->buffers[cpu]; again: - if (rb_per_cpu_empty(cpu_buffer)) + reader = rb_get_reader_page(cpu_buffer); + if (!reader) return NULL; - event = rb_head_event(cpu_buffer); + event = rb_reader_event(cpu_buffer); switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); - rb_reset_read_page(cpu_buffer); - goto again; + WARN_ON(1); + rb_advance_reader(cpu_buffer); + return NULL; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ - rb_advance_head(cpu_buffer); + rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_TIME_STAMP: /* FIXME: not implemented */ - rb_advance_head(cpu_buffer); + rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: @@ -1415,8 +1462,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_page(cpu_buffer, &iter->head_page); - rb_reset_iter_read_page(iter); + rb_inc_iter(iter); goto again; case RINGBUF_TYPE_TIME_EXTEND: @@ -1465,7 +1511,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) return NULL; cpu_buffer = buffer->buffers[cpu]; - rb_advance_head(cpu_buffer); + rb_advance_reader(cpu_buffer); return event; } @@ -1487,6 +1533,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; + unsigned long flags; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; @@ -1502,11 +1549,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - spin_lock(&cpu_buffer->lock); - iter->head = cpu_buffer->head; - iter->head_page = cpu_buffer->head_page; - rb_reset_iter_read_page(iter); - spin_unlock(&cpu_buffer->lock); + spin_lock_irqsave(&cpu_buffer->lock, flags); + ring_buffer_iter_reset(iter); + spin_unlock_irqrestore(&cpu_buffer->lock, flags); return iter; } @@ -1562,10 +1607,14 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); - cpu_buffer->tail_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + cpu_buffer->head_page->size = 0; + cpu_buffer->tail_page = cpu_buffer->head_page; + cpu_buffer->tail_page->size = 0; + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + cpu_buffer->reader_page->size = 0; + + cpu_buffer->head = cpu_buffer->tail = cpu_buffer->reader = 0; - cpu_buffer->head = cpu_buffer->tail = 0; cpu_buffer->overrun = 0; cpu_buffer->entries = 0; } @@ -1583,13 +1632,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - local_irq_save(flags); - spin_lock(&cpu_buffer->lock); + spin_lock_irqsave(&cpu_buffer->lock, flags); rb_reset_cpu(cpu_buffer); - spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&cpu_buffer->lock, flags); } /** @@ -1598,15 +1645,10 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) */ void ring_buffer_reset(struct ring_buffer *buffer) { - unsigned long flags; int cpu; - ring_buffer_lock(buffer, &flags); - for_each_buffer_cpu(buffer, cpu) - rb_reset_cpu(buffer->buffers[cpu]); - - ring_buffer_unlock(buffer, flags); + ring_buffer_reset_cpu(buffer, cpu); } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a1c76bb56ba..b542f8837801 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,6 +42,20 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); + +static inline void ftrace_disable_cpu(void) +{ + preempt_disable(); + local_inc(&__get_cpu_var(ftrace_cpu_disabled)); +} + +static inline void ftrace_enable_cpu(void) +{ + local_dec(&__get_cpu_var(ftrace_cpu_disabled)); + preempt_enable(); +} + static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ @@ -406,7 +420,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tr->buffer = max_tr.buffer; max_tr.buffer = buf; + ftrace_disable_cpu(); ring_buffer_reset(tr->buffer); + ftrace_enable_cpu(); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -428,9 +444,13 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); + ftrace_disable_cpu(); + ring_buffer_reset(max_tr.buffer); ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + ftrace_enable_cpu(); + WARN_ON_ONCE(ret); __update_max_tr(tr, tsk, cpu); @@ -543,7 +563,9 @@ void unregister_tracer(struct tracer *type) void tracing_reset(struct trace_array *tr, int cpu) { + ftrace_disable_cpu(); ring_buffer_reset_cpu(tr->buffer, cpu); + ftrace_enable_cpu(); } #define SAVED_CMDLINES 128 @@ -654,6 +676,10 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, struct ftrace_entry *entry; unsigned long irq_flags; + /* If we are reading the ring buffer, don't trace */ + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); if (!event) @@ -870,8 +896,14 @@ enum trace_file_type { static void trace_iterator_increment(struct trace_iterator *iter, int cpu) { + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + iter->idx++; - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + if (iter->buffer_iter[iter->cpu]) + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + + ftrace_enable_cpu(); } static struct trace_entry * @@ -880,9 +912,19 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - event = ring_buffer_iter_peek(buf_iter, ts); + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + + if (buf_iter) + event = ring_buffer_iter_peek(buf_iter, ts); + else + event = ring_buffer_peek(iter->tr->buffer, cpu, ts); + + ftrace_enable_cpu(); + return event ? ring_buffer_event_data(event) : NULL; } + static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { @@ -938,7 +980,10 @@ static void *find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ftrace_enable_cpu(); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -991,10 +1036,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; + ftrace_disable_cpu(); + for_each_tracing_cpu(cpu) { ring_buffer_iter_reset(iter->buffer_iter[cpu]); } + ftrace_enable_cpu(); + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -1242,7 +1291,16 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) cont = (struct trace_field_cont *)ent; if (ok) ok = (trace_seq_printf(s, "%s", cont->buf) > 0); - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + + ftrace_disable_cpu(); + + if (iter->buffer_iter[iter->cpu]) + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + else + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + + ftrace_enable_cpu(); + ent = peek_next_entry(iter, iter->cpu, NULL); } while (ent && ent->type == TRACE_CONT); @@ -1683,9 +1741,15 @@ static int trace_empty(struct trace_iterator *iter) int cpu; for_each_tracing_cpu(cpu) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) - return 0; + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } } + return TRACE_TYPE_HANDLED; } @@ -1776,8 +1840,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->pos = -1; for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); + if (!iter->buffer_iter[cpu]) goto fail_buffer; } @@ -2341,7 +2407,6 @@ static atomic_t tracing_reader; static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_iterator *iter; - int cpu; if (tracing_disabled) return -ENODEV; @@ -2362,38 +2427,17 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->trace = current_trace; filp->private_data = iter; - for_each_tracing_cpu(cpu) { - iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); - if (!iter->buffer_iter[cpu]) - goto fail_buffer; - } - if (iter->trace->pipe_open) iter->trace->pipe_open(iter); mutex_unlock(&trace_types_lock); return 0; - - fail_buffer: - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) - ring_buffer_read_finish(iter->buffer_iter[cpu]); - } - mutex_unlock(&trace_types_lock); - - return -ENOMEM; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; - int cpu; - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) - ring_buffer_read_finish(iter->buffer_iter[cpu]); - } kfree(iter); atomic_dec(&tracing_reader); @@ -2429,7 +2473,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - unsigned long flags; #ifdef CONFIG_FTRACE int ftrace_save; #endif @@ -2528,7 +2571,6 @@ waitagain: ftrace_enabled = 0; #endif smp_wmb(); - ring_buffer_lock(iter->tr->buffer, &flags); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; @@ -2547,7 +2589,6 @@ waitagain: break; } - ring_buffer_unlock(iter->tr->buffer, flags); #ifdef CONFIG_FTRACE ftrace_enabled = ftrace_save; #endif @@ -3010,8 +3051,8 @@ void ftrace_dump(void) static struct trace_iterator iter; static cpumask_t mask; static int dump_ran; - unsigned long flags, irq_flags; - int cnt = 0; + unsigned long flags; + int cnt = 0, cpu; /* only one dump */ spin_lock_irqsave(&ftrace_dump_lock, flags); @@ -3023,6 +3064,10 @@ void ftrace_dump(void) /* No turning back! */ ftrace_kill_atomic(); + for_each_tracing_cpu(cpu) { + atomic_inc(&global_trace.data[cpu]->disabled); + } + printk(KERN_TRACE "Dumping ftrace buffer:\n"); iter.tr = &global_trace; @@ -3037,8 +3082,6 @@ void ftrace_dump(void) cpus_clear(mask); - ring_buffer_lock(iter.tr->buffer, &irq_flags); - while (!trace_empty(&iter)) { if (!cnt) @@ -3066,8 +3109,6 @@ void ftrace_dump(void) else printk(KERN_TRACE "---------------------------------\n"); - ring_buffer_unlock(iter.tr->buffer, irq_flags); - out: spin_unlock_irqrestore(&ftrace_dump_lock, flags); } -- cgit v1.2.1 From 797d3712a9dd75c720558612be05f42c031a7bb5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 30 Sep 2008 18:13:45 +0200 Subject: tracing/ftrace: adapt mmiotrace to the new type of print_line, fix Correct the value's type of trace_empty function Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b542f8837801..c1634068adfa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1750,7 +1750,7 @@ static int trace_empty(struct trace_iterator *iter) } } - return TRACE_TYPE_HANDLED; + return 1; } static enum print_line_t print_trace_line(struct trace_iterator *iter) -- cgit v1.2.1 From 7104f300c5a69b46dda00d898034dd05c9f21739 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Oct 2008 10:52:51 -0400 Subject: ftrace: type cast filter+verifier The mmiotrace map had a bug that would typecast the entry from the trace to the wrong type. That is a known danger of C typecasts, there's absolutely zero checking done on them. Help that problem a bit by using a GCC extension to implement a type filter that restricts the types that a trace record can be cast into, and by adding a dynamic check (in debug mode) to verify the type of the entry. This patch adds a macro to assign all entries of ftrace using the type of the variable and checking the entry id. The typecasts are now done in the macro for only those types that it knows about, which should be all the types that are allowed to be read from the tracer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 85 +++++++++++++++++++++++++++++------------- kernel/trace/trace.h | 42 +++++++++++++++++++++ kernel/trace/trace_mmiotrace.c | 14 +++++-- 3 files changed, 112 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c1634068adfa..948f7d821c62 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1350,7 +1350,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) } switch (entry->type) { case TRACE_FN: { - struct ftrace_entry *field = (struct ftrace_entry *)entry; + struct ftrace_entry *field; + + trace_assign_type(field, entry); seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_puts(s, " ("); @@ -1363,8 +1365,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) } case TRACE_CTX: case TRACE_WAKE: { - struct ctx_switch_entry *field = - (struct ctx_switch_entry *)entry; + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); T = field->next_state < sizeof(state_to_char) ? state_to_char[field->next_state] : 'X'; @@ -1384,7 +1387,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) break; } case TRACE_SPECIAL: { - struct special_entry *field = (struct special_entry *)entry; + struct special_entry *field; + + trace_assign_type(field, entry); trace_seq_printf(s, "# %ld %ld %ld\n", field->arg1, @@ -1393,7 +1398,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) break; } case TRACE_STACK: { - struct stack_entry *field = (struct stack_entry *)entry; + struct stack_entry *field; + + trace_assign_type(field, entry); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) @@ -1404,7 +1411,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) break; } case TRACE_PRINT: { - struct print_entry *field = (struct print_entry *)entry; + struct print_entry *field; + + trace_assign_type(field, entry); seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_printf(s, ": %s", field->buf); @@ -1454,7 +1463,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) switch (entry->type) { case TRACE_FN: { - struct ftrace_entry *field = (struct ftrace_entry *)entry; + struct ftrace_entry *field; + + trace_assign_type(field, entry); ret = seq_print_ip_sym(s, field->ip, sym_flags); if (!ret) @@ -1480,8 +1491,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) } case TRACE_CTX: case TRACE_WAKE: { - struct ctx_switch_entry *field = - (struct ctx_switch_entry *)entry; + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); S = field->prev_state < sizeof(state_to_char) ? state_to_char[field->prev_state] : 'X'; @@ -1501,7 +1513,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: { - struct special_entry *field = (struct special_entry *)entry; + struct special_entry *field; + + trace_assign_type(field, entry); ret = trace_seq_printf(s, "# %ld %ld %ld\n", field->arg1, @@ -1512,7 +1526,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) break; } case TRACE_STACK: { - struct stack_entry *field = (struct stack_entry *)entry; + struct stack_entry *field; + + trace_assign_type(field, entry); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { @@ -1531,7 +1547,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) break; } case TRACE_PRINT: { - struct print_entry *field = (struct print_entry *)entry; + struct print_entry *field; + + trace_assign_type(field, entry); seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_printf(s, ": %s", field->buf); @@ -1562,7 +1580,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) switch (entry->type) { case TRACE_FN: { - struct ftrace_entry *field = (struct ftrace_entry *)entry; + struct ftrace_entry *field; + + trace_assign_type(field, entry); ret = trace_seq_printf(s, "%x %x\n", field->ip, @@ -1573,8 +1593,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) } case TRACE_CTX: case TRACE_WAKE: { - struct ctx_switch_entry *field = - (struct ctx_switch_entry *)entry; + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); S = field->prev_state < sizeof(state_to_char) ? state_to_char[field->prev_state] : 'X'; @@ -1596,7 +1617,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) } case TRACE_SPECIAL: case TRACE_STACK: { - struct special_entry *field = (struct special_entry *)entry; + struct special_entry *field; + + trace_assign_type(field, entry); ret = trace_seq_printf(s, "# %ld %ld %ld\n", field->arg1, @@ -1607,7 +1630,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) break; } case TRACE_PRINT: { - struct print_entry *field = (struct print_entry *)entry; + struct print_entry *field; + + trace_assign_type(field, entry); trace_seq_printf(s, "# %lx %s", field->ip, field->buf); if (entry->flags & TRACE_FLAG_CONT) @@ -1648,7 +1673,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) switch (entry->type) { case TRACE_FN: { - struct ftrace_entry *field = (struct ftrace_entry *)entry; + struct ftrace_entry *field; + + trace_assign_type(field, entry); SEQ_PUT_HEX_FIELD_RET(s, field->ip); SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); @@ -1656,8 +1683,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) } case TRACE_CTX: case TRACE_WAKE: { - struct ctx_switch_entry *field = - (struct ctx_switch_entry *)entry; + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); S = field->prev_state < sizeof(state_to_char) ? state_to_char[field->prev_state] : 'X'; @@ -1676,7 +1704,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) } case TRACE_SPECIAL: case TRACE_STACK: { - struct special_entry *field = (struct special_entry *)entry; + struct special_entry *field; + + trace_assign_type(field, entry); SEQ_PUT_HEX_FIELD_RET(s, field->arg1); SEQ_PUT_HEX_FIELD_RET(s, field->arg2); @@ -1705,15 +1735,18 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) switch (entry->type) { case TRACE_FN: { - struct ftrace_entry *field = (struct ftrace_entry *)entry; + struct ftrace_entry *field; + + trace_assign_type(field, entry); SEQ_PUT_FIELD_RET(s, field->ip); SEQ_PUT_FIELD_RET(s, field->parent_ip); break; } case TRACE_CTX: { - struct ctx_switch_entry *field = - (struct ctx_switch_entry *)entry; + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); SEQ_PUT_FIELD_RET(s, field->prev_pid); SEQ_PUT_FIELD_RET(s, field->prev_prio); @@ -1725,7 +1758,9 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) } case TRACE_SPECIAL: case TRACE_STACK: { - struct special_entry *field = (struct special_entry *)entry; + struct special_entry *field; + + trace_assign_type(field, entry); SEQ_PUT_FIELD_RET(s, field->arg1); SEQ_PUT_FIELD_RET(s, field->arg2); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a921ba5d292d..f02042d0d828 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -177,6 +177,48 @@ struct trace_array { struct trace_array_cpu *data[NR_CPUS]; }; +#define FTRACE_CMP_TYPE(var, type) \ + __builtin_types_compatible_p(typeof(var), type *) + +#undef IF_ASSIGN +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id && (entry)->type != id); \ + break; \ + } + +/* Will cause compile errors if type is not found. */ +extern void __ftrace_bad_type(void); + +/* + * The trace_assign_type is a verifier that the entry type is + * the same as the type being assigned. To add new types simply + * add a line with the following format: + * + * IF_ASSIGN(var, ent, type, id); + * + * Where "type" is the trace type that includes the trace_entry + * as the "ent" item. And "id" is the trace identifier that is + * used in the trace_type enum. + * + * If the type can have more than one id, then use zero. + */ +#define trace_assign_type(var, ent) \ + do { \ + IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ + IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ + IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ + IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct special_entry, 0); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ + TRACE_MMIO_RW); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ + TRACE_MMIO_MAP); \ + IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ + __ftrace_bad_type(); \ + } while (0) /* Return values for print_line callback */ enum print_line_t { diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 1a266aa08e1a..0e819f47bb7a 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -178,15 +178,17 @@ print_out: static enum print_line_t mmio_print_rw(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct trace_mmiotrace_rw *field = - (struct trace_mmiotrace_rw *)entry; - struct mmiotrace_rw *rw = &field->rw; + struct trace_mmiotrace_rw *field; + struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; + trace_assign_type(field, entry); + rw = &field->rw; + switch (rw->opcode) { case MMIO_READ: ret = trace_seq_printf(s, @@ -222,13 +224,17 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) static enum print_line_t mmio_print_map(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_map *m = (struct mmiotrace_map *)entry; + struct trace_mmiotrace_map *field; + struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret; + trace_assign_type(field, entry); + m = &field->map; + switch (m->opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, -- cgit v1.2.1 From e4c2ce82ca2710e17cb4df8eb2b249fa2eb5af30 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Oct 2008 11:14:54 -0400 Subject: ring_buffer: allocate buffer page pointer The current method of overlaying the page frame as the buffer page pointer can be very dangerous and limits our ability to do other things with a page from the buffer, like send it off to disk. This patch allocates the buffer_page instead of overlaying the page's page frame. The use of the buffer_page has hardly changed due to this. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 54 +++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9631abf2ae29..98145718988d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -115,16 +115,10 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) * Thanks to Peter Zijlstra for suggesting this idea. */ struct buffer_page { - union { - struct { - unsigned long flags; /* mandatory */ - atomic_t _count; /* mandatory */ - u64 time_stamp; /* page time stamp */ - unsigned size; /* size of page data */ - struct list_head list; /* list of free pages */ - }; - struct page page; - }; + u64 time_stamp; /* page time stamp */ + unsigned size; /* size of page data */ + struct list_head list; /* list of free pages */ + void *page; /* Actual data page */ }; /* @@ -133,9 +127,9 @@ struct buffer_page { */ static inline void free_buffer_page(struct buffer_page *bpage) { - reset_page_mapcount(&bpage->page); - bpage->page.mapping = NULL; - __free_page(&bpage->page); + if (bpage->page) + __free_page(bpage->page); + kfree(bpage); } /* @@ -237,11 +231,16 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned i; for (i = 0; i < nr_pages; i++) { + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto free_pages; + list_add(&page->list, &pages); + addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page = (struct buffer_page *)virt_to_page(addr); - list_add(&page->list, &pages); + page->page = (void *)addr; } list_splice(&pages, head); @@ -262,6 +261,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *page; unsigned long addr; int ret; @@ -275,10 +275,17 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) spin_lock_init(&cpu_buffer->lock); INIT_LIST_HEAD(&cpu_buffer->pages); + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto fail_free_buffer; + + cpu_buffer->reader_page = page; addr = __get_free_page(GFP_KERNEL); if (!addr) - goto fail_free_buffer; - cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr); + goto fail_free_reader; + page->page = (void *)addr; + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); cpu_buffer->reader_page->size = 0; @@ -523,11 +530,16 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { + page = kzalloc_node(ALIGN(sizeof(*page), + cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto free_pages; + list_add(&page->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page = (struct buffer_page *)virt_to_page(addr); - list_add(&page->list, &pages); + page->page = (void *)addr; } } @@ -567,9 +579,7 @@ static inline int rb_null_event(struct ring_buffer_event *event) static inline void *rb_page_index(struct buffer_page *page, unsigned index) { - void *addr = page_address(&page->page); - - return addr + index; + return page->page + index; } static inline struct ring_buffer_event * -- cgit v1.2.1 From 38697053fa006411224a1790e2adb8216440ab0f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Oct 2008 13:14:09 -0400 Subject: ftrace: preempt disable over interrupt disable With the new ring buffer infrastructure in ftrace, I'm trying to make ftrace a little more light weight. This patch converts a lot of the local_irq_save/restore into preempt_disable/enable. The original preempt count in a lot of cases has to be sent in as a parameter so that it can be recorded correctly. Some places were recording it incorrectly before anyway. This is also laying the ground work to make ftrace a little bit more reentrant, and remove all locking. The function tracers must still protect from reentrancy. Note: All the function tracers must be careful when using preempt_disable. It must do the following: resched = need_resched(); preempt_disable_notrace(); [...] if (resched) preempt_enable_no_resched_notrace(); else preempt_enable_notrace(); The reason is that if this function traces schedule() itself, the preempt_enable_notrace() will cause a schedule, which will lead us into a recursive failure. If we needed to reschedule before calling preempt_disable, we should have already scheduled. Since we did not, this is most likely that we should not and are probably inside a schedule function. If resched was not set, we still need to catch the need resched flag being set when preemption was off and the if case at the end will catch that for us. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 123 +++++++++++++++++++------------------- kernel/trace/trace.h | 13 ++-- kernel/trace/trace_boot.c | 2 +- kernel/trace/trace_irqsoff.c | 13 ++-- kernel/trace/trace_mmiotrace.c | 4 +- kernel/trace/trace_sched_switch.c | 9 ++- kernel/trace/trace_sched_wakeup.c | 13 +++- 7 files changed, 97 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 948f7d821c62..1cd2e8143bb4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -652,12 +652,10 @@ void tracing_record_cmdline(struct task_struct *tsk) } void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, + int pc) { struct task_struct *tsk = current; - unsigned long pc; - - pc = preempt_count(); entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; @@ -670,7 +668,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) void trace_function(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) { struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -685,7 +684,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags); + tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_FN; entry->ip = ip; entry->parent_ip = parent_ip; @@ -694,16 +693,17 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, void ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) { if (likely(!atomic_read(&data->disabled))) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); } -void __trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags, - int skip) +static void ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) { struct ring_buffer_event *event; struct stack_entry *entry; @@ -718,7 +718,7 @@ void __trace_stack(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags); + tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_STACK; memset(&entry->caller, 0, sizeof(entry->caller)); @@ -732,9 +732,18 @@ void __trace_stack(struct trace_array *tr, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + ftrace_trace_stack(tr, data, flags, skip, preempt_count()); +} + +static void +ftrace_trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3, + int pc) { struct ring_buffer_event *event; struct trace_array_cpu *data = __data; @@ -747,23 +756,30 @@ __trace_special(void *__tr, void *__data, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0); + tracing_generic_entry_update(&entry->ent, 0, pc); entry->ent.type = TRACE_SPECIAL; entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - __trace_stack(tr, data, irq_flags, 4); + ftrace_trace_stack(tr, data, irq_flags, 4, pc); trace_wake_up(); } +void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); +} + void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags) + unsigned long flags, int pc) { struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -774,7 +790,7 @@ tracing_sched_switch_trace(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags); + tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_CTX; entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; @@ -784,7 +800,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - __trace_stack(tr, data, flags, 5); + ftrace_trace_stack(tr, data, flags, 5, pc); } void @@ -792,7 +808,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *curr, - unsigned long flags) + unsigned long flags, int pc) { struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -803,7 +819,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags); + tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_WAKE; entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; @@ -813,7 +829,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - __trace_stack(tr, data, flags, 6); + ftrace_trace_stack(tr, data, flags, 6, pc); trace_wake_up(); } @@ -823,23 +839,24 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; + int pc; if (tracing_disabled || !tr->ctrl) return; - local_irq_save(flags); + pc = preempt_count(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - __trace_special(tr, data, arg1, arg2, arg3); + ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); } #ifdef CONFIG_FTRACE @@ -850,7 +867,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu; + int cpu, resched; + int pc; if (unlikely(!ftrace_function_enabled)) return; @@ -858,16 +876,22 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (skip_trace(ip)) return; - local_irq_save(flags); + pc = preempt_count(); + resched = need_resched(); + preempt_disable_notrace(); + local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - local_irq_restore(flags); + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -2508,9 +2532,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; -#ifdef CONFIG_FTRACE - int ftrace_save; -#endif ssize_t sret; /* return any leftover data */ @@ -2593,20 +2614,6 @@ waitagain: offsetof(struct trace_iterator, seq)); iter->pos = -1; - /* - * We need to stop all tracing on all CPUS to read the - * the next buffer. This is a bit expensive, but is - * not done often. We fill all what we can read, - * and then release the locks again. - */ - - local_irq_disable(); -#ifdef CONFIG_FTRACE - ftrace_save = ftrace_enabled; - ftrace_enabled = 0; -#endif - smp_wmb(); - while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -2624,11 +2631,6 @@ waitagain: break; } -#ifdef CONFIG_FTRACE - ftrace_enabled = ftrace_save; -#endif - local_irq_enable(); - /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= iter->seq.len) @@ -2960,12 +2962,13 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) struct print_entry *entry; unsigned long flags, irq_flags; long disabled; - int cpu, len = 0, size; + int cpu, len = 0, size, pc; if (!tr->ctrl || tracing_disabled) return 0; - local_irq_save(flags); + pc = preempt_count(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -2973,7 +2976,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (unlikely(disabled != 1)) goto out; - spin_lock(&trace_buf_lock); + spin_lock_irqsave(&trace_buf_lock, flags); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -2984,7 +2987,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags); + tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_PRINT; entry->ip = ip; @@ -2993,11 +2996,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event, irq_flags); out_unlock: - spin_unlock(&trace_buf_lock); + spin_unlock_irqrestore(&trace_buf_lock, flags); out: atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); return len; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f02042d0d828..f1f99572cde7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -288,35 +288,36 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags); + unsigned long flags, + int pc); void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, - unsigned long flags); + unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags); + unsigned long flags, int pc); void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *cur, - unsigned long flags); + unsigned long flags, int pc); void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, - unsigned long arg3); + unsigned long arg3, int pc); void trace_function(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, - unsigned long flags); + unsigned long flags, int pc); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 43bde20b95bd..f2dac6f1cf06 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -95,7 +95,7 @@ void trace_boot(struct boot_trace *it) if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0); + tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT; entry->initcall = *it; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 37ad49407f27..f925dbbff2a6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -130,6 +130,7 @@ check_critical_timing(struct trace_array *tr, unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; + int pc; /* * usecs conversion is slow so we try to delay the conversion @@ -144,13 +145,15 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; + pc = preempt_count(); + spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); latency = nsecs_to_usecs(delta); @@ -174,7 +177,7 @@ out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); tracing_reset(tr, cpu); - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -207,7 +210,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -241,7 +244,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0e819f47bb7a..f28484618ff0 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -324,7 +324,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0); + tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; entry->rw = *rw; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); @@ -352,7 +352,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0); + tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; entry->map = *map; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index e0b06db0f7af..c7fa08a5b7f4 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -26,6 +26,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, unsigned long flags; long disabled; int cpu; + int pc; if (!atomic_read(&sched_ref)) return; @@ -36,13 +37,14 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, if (!tracer_enabled) return; + pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - tracing_sched_switch_trace(ctx_trace, data, prev, next, flags); + tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -54,11 +56,12 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu; + int cpu, pc; if (!likely(tracer_enabled)) return; + pc = preempt_count(); tracing_record_cmdline(current); local_irq_save(flags); @@ -68,7 +71,7 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) if (likely(disabled == 1)) tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, - flags); + flags, pc); atomic_dec(&data->disabled); local_irq_restore(flags); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 01e75e0639b7..fe4a252c2363 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -44,10 +44,12 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int resched; int cpu; + int pc; if (likely(!wakeup_task)) return; + pc = preempt_count(); resched = need_resched(); preempt_disable_notrace(); @@ -70,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (task_cpu(wakeup_task) != cpu) goto unlock; - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); unlock: __raw_spin_unlock(&wakeup_lock); @@ -121,6 +123,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, unsigned long flags; long disabled; int cpu; + int pc; tracing_record_cmdline(prev); @@ -139,6 +142,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (next != wakeup_task) return; + pc = preempt_count(); + /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; @@ -155,7 +160,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -220,6 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p) int cpu = smp_processor_id(); unsigned long flags; long disabled; + int pc; if (likely(!tracer_enabled)) return; @@ -232,6 +238,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p) p->prio >= current->prio) return; + pc = preempt_count(); disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); if (unlikely(disabled != 1)) goto out; @@ -256,7 +263,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p) wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], - CALLER_ADDR1, CALLER_ADDR2, flags); + CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); -- cgit v1.2.1 From 77ae11f63befb7fc41ec256f1fcb72ca7e4160d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Oct 2008 11:04:14 +0200 Subject: ring-buffer: fix build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: kernel/trace/ring_buffer.c: In function ‘rb_allocate_pages’: kernel/trace/ring_buffer.c:235: error: ‘cpu’ undeclared (first use in this function) kernel/trace/ring_buffer.c:235: error: (Each undeclared identifier is reported only once kernel/trace/ring_buffer.c:235: error: for each function it appears in.) Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 98145718988d..54a30986493a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -232,7 +232,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL, cpu_to_node(i)); if (!page) goto free_pages; list_add(&page->list, &pages); -- cgit v1.2.1 From cb5ab74204a6e2579d1119bf1348eb806526b12b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 2 Oct 2008 12:59:20 +0200 Subject: tracing/fastboot: change the printing of boot tracer according to bootgraph.pl Change the boot tracer printing to make it parsable for the scripts/bootgraph.pl script. We have now to output two lines for each initcall, according to the printk in do_one_initcall() in init/main.c We need now the call's time and the return's time. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_boot.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index f2dac6f1cf06..7c15f3e68ba3 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -52,16 +52,24 @@ static enum print_line_t initcall_print_line(struct trace_iterator *iter) struct trace_boot *field = (struct trace_boot *)entry; struct boot_trace *it = &field->initcall; struct trace_seq *s = &iter->seq; + struct timespec calltime = ktime_to_timespec(it->calltime); + struct timespec rettime = ktime_to_timespec(it->rettime); if (entry->type == TRACE_BOOT) { - ret = trace_seq_printf(s, "%pF called from %i " - "returned %d after %lld msecs\n", - it->func, it->caller, it->result, - it->duration); - if (ret) - return TRACE_TYPE_HANDLED; - else + ret = trace_seq_printf(s, "[%5ld.%06ld] calling %pF @ %i\n", + calltime.tv_sec, + calltime.tv_nsec, + it->func, it->caller); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, "[%5ld.%06ld] initcall %pF " + "returned %d after %lld msecs\n", + rettime.tv_sec, + rettime.tv_nsec, + it->func, it->result, it->duration); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } return TRACE_TYPE_UNHANDLED; } -- cgit v1.2.1 From 5601020feb0c3010e9e3e0131e9697ac6a06777b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 2 Oct 2008 13:26:05 +0200 Subject: tracing/fastboot: get the initcall name before it disappears After some initcall traces, some initcall names may be inconsistent. That's because these functions will disappear from the .init section and also their name from the symbols table. So we have to copy the name of the function in a buffer large enough during the trace appending. It is not costly for the ring_buffer because the number of initcall entries is commonly not really large. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_boot.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7c15f3e68ba3..b9dc2c0093ab 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "trace.h" @@ -56,17 +57,19 @@ static enum print_line_t initcall_print_line(struct trace_iterator *iter) struct timespec rettime = ktime_to_timespec(it->rettime); if (entry->type == TRACE_BOOT) { - ret = trace_seq_printf(s, "[%5ld.%06ld] calling %pF @ %i\n", + ret = trace_seq_printf(s, "[%5ld.%06ld] calling %s @ %i\n", calltime.tv_sec, calltime.tv_nsec, it->func, it->caller); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "[%5ld.%06ld] initcall %pF " + + ret = trace_seq_printf(s, "[%5ld.%06ld] initcall %s " "returned %d after %lld msecs\n", rettime.tv_sec, rettime.tv_nsec, it->func, it->result, it->duration); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; return TRACE_TYPE_HANDLED; @@ -83,8 +86,7 @@ struct tracer boot_tracer __read_mostly = .print_line = initcall_print_line, }; - -void trace_boot(struct boot_trace *it) +void trace_boot(struct boot_trace *it, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot *entry; @@ -95,6 +97,10 @@ void trace_boot(struct boot_trace *it) if (!trace_boot_enabled) return; + /* Get its name now since this function could + * disappear because it is in the .init section. + */ + sprint_symbol(it->func, (unsigned long)fn); preempt_disable(); data = tr->data[smp_processor_id()]; -- cgit v1.2.1 From aa1e0e3bcf95ce684d005bedb16e5d4559455685 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Oct 2008 19:18:09 -0400 Subject: ring_buffer: map to cpu not page My original patch had a compile bug when NUMA was configured. I referenced cpu when it should have been cpu_buffer->cpu. Ingo quickly fixed this bug by replacing cpu with 'i' because that was the loop counter. Unfortunately, the 'i' was the counter of pages, not CPUs. This caused a crash when the number of pages allocated for the buffers exceeded the number of pages, which would usually be the case. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 54a30986493a..6b8dac02364f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -232,7 +232,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), - GFP_KERNEL, cpu_to_node(i)); + GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!page) goto free_pages; list_add(&page->list, &pages); -- cgit v1.2.1 From 6450c1d3213e27b0dcbf34cce7ad1ae74244c520 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Oct 2008 19:23:04 -0400 Subject: ftrace: move pc counter in irqtrace The assigning of the pc counter is in the wrong spot in the check_critical_timing function. The pc variable is used in the out jump. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index f925dbbff2a6..a7db7f040ae0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -142,11 +142,11 @@ check_critical_timing(struct trace_array *tr, local_save_flags(flags); + pc = preempt_count(); + if (!report_latency(delta)) goto out; - pc = preempt_count(); - spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ -- cgit v1.2.1 From 097d036a2f25eecc42435c57e010aaf4a2eed2d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 15:39:21 +0200 Subject: tracing/fastboot: only trace non-module initcalls At this time, only built-in initcalls interest us. We can't really produce a relevant graph if we include the modules initcall too. I had good results after this patch (see svg in attachment). Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_boot.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index b9dc2c0093ab..a7efe3559654 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -22,11 +22,16 @@ void start_boot_trace(void) trace_boot_enabled = 1; } -void stop_boot_trace(struct trace_array *tr) +void stop_boot_trace(void) { trace_boot_enabled = 0; } +void reset_boot_trace(struct trace_array *tr) +{ + stop_boot_trace(); +} + static void boot_trace_init(struct trace_array *tr) { int cpu; @@ -43,7 +48,7 @@ static void boot_trace_ctrl_update(struct trace_array *tr) if (tr->ctrl) start_boot_trace(); else - stop_boot_trace(tr); + stop_boot_trace(); } static enum print_line_t initcall_print_line(struct trace_iterator *iter) @@ -81,7 +86,7 @@ struct tracer boot_tracer __read_mostly = { .name = "initcall", .init = boot_trace_init, - .reset = stop_boot_trace, + .reset = reset_boot_trace, .ctrl_update = boot_trace_ctrl_update, .print_line = initcall_print_line, }; -- cgit v1.2.1 From 6f807acd27734197b11d42829d3cbb9c0937b572 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 4 Oct 2008 02:00:58 -0400 Subject: ring-buffer: move page indexes into page headers Remove the global head and tail indexes and move them into the page header. Each page will now keep track of where the last write and read was made. We also rename the head and tail to read and write for better clarification. This patch is needed for future enhancements to move the ring buffer to a lockless solution. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 75 +++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6b8dac02364f..09d4f0d879a7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -117,6 +117,8 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) struct buffer_page { u64 time_stamp; /* page time stamp */ unsigned size; /* size of page data */ + unsigned write; /* index for next write */ + unsigned read; /* index for next read */ struct list_head list; /* list of free pages */ void *page; /* Actual data page */ }; @@ -153,11 +155,8 @@ struct ring_buffer_per_cpu { spinlock_t lock; struct lock_class_key lock_key; struct list_head pages; - unsigned long head; /* read from head */ - unsigned long tail; /* write to tail */ - unsigned long reader; - struct buffer_page *head_page; - struct buffer_page *tail_page; + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ struct buffer_page *reader_page; unsigned long overrun; unsigned long entries; @@ -566,10 +565,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { - return (cpu_buffer->reader == cpu_buffer->reader_page->size && + return cpu_buffer->reader_page->read == cpu_buffer->reader_page->size && (cpu_buffer->tail_page == cpu_buffer->reader_page || (cpu_buffer->tail_page == cpu_buffer->head_page && - cpu_buffer->head == cpu_buffer->tail))); + cpu_buffer->head_page->read == + cpu_buffer->tail_page->write)); } static inline int rb_null_event(struct ring_buffer_event *event) @@ -577,7 +577,7 @@ static inline int rb_null_event(struct ring_buffer_event *event) return event->type == RINGBUF_TYPE_PADDING; } -static inline void *rb_page_index(struct buffer_page *page, unsigned index) +static inline void *__rb_page_index(struct buffer_page *page, unsigned index) { return page->page + index; } @@ -585,15 +585,21 @@ static inline void *rb_page_index(struct buffer_page *page, unsigned index) static inline struct ring_buffer_event * rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { - return rb_page_index(cpu_buffer->reader_page, - cpu_buffer->reader); + return __rb_page_index(cpu_buffer->reader_page, + cpu_buffer->reader_page->read); +} + +static inline struct ring_buffer_event * +rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->head_page, + cpu_buffer->head_page->read); } static inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { - return rb_page_index(iter->head_page, - iter->head); + return __rb_page_index(iter->head_page, iter->head); } /* @@ -610,7 +616,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) for (head = 0; head < rb_head_size(cpu_buffer); head += rb_event_length(event)) { - event = rb_page_index(cpu_buffer->head_page, head); + event = __rb_page_index(cpu_buffer->head_page, head); BUG_ON(rb_null_event(event)); /* Only count data entries */ if (event->type != RINGBUF_TYPE_DATA) @@ -640,13 +646,13 @@ rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->head = 0; + cpu_buffer->head_page->read = 0; } static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; - cpu_buffer->reader = 0; + cpu_buffer->reader_page->read = 0; } static inline void rb_inc_iter(struct ring_buffer_iter *iter) @@ -743,9 +749,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - /* No locking needed for tail page */ tail_page = cpu_buffer->tail_page; - tail = cpu_buffer->tail; + tail = cpu_buffer->tail_page->write; if (tail + length > BUF_PAGE_SIZE) { struct buffer_page *next_page = tail_page; @@ -774,7 +779,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } if (tail != BUF_PAGE_SIZE) { - event = rb_page_index(tail_page, tail); + event = __rb_page_index(tail_page, tail); /* page padding */ event->type = RINGBUF_TYPE_PADDING; } @@ -784,14 +789,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail_page->size = 0; tail = 0; cpu_buffer->tail_page = tail_page; - cpu_buffer->tail = tail; + cpu_buffer->tail_page->write = tail; rb_add_stamp(cpu_buffer, ts); spin_unlock(&cpu_buffer->lock); } BUG_ON(tail + length > BUF_PAGE_SIZE); - event = rb_page_index(tail_page, tail); + event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); return event; @@ -823,12 +828,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return -1; /* check to see if we went to the next page */ - if (cpu_buffer->tail) { + if (cpu_buffer->tail_page->write) { /* Still on same page, update timestamp */ event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; /* commit the time event */ - cpu_buffer->tail += + cpu_buffer->tail_page->write += rb_event_length(event); cpu_buffer->write_stamp = *ts; *delta = 0; @@ -846,7 +851,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, ts = ring_buffer_time_stamp(cpu_buffer->cpu); - if (cpu_buffer->tail) { + if (cpu_buffer->tail_page->write) { delta = ts - cpu_buffer->write_stamp; if (test_time_stamp(delta)) { @@ -868,7 +873,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return NULL; /* If the reserve went to the next page, our delta is zero */ - if (!cpu_buffer->tail) + if (!cpu_buffer->tail_page->write) delta = 0; event->time_delta = delta; @@ -933,8 +938,8 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->tail += rb_event_length(event); - cpu_buffer->tail_page->size = cpu_buffer->tail; + cpu_buffer->tail_page->write += rb_event_length(event); + cpu_buffer->tail_page->size = cpu_buffer->tail_page->write; cpu_buffer->write_stamp += event->time_delta; cpu_buffer->entries++; } @@ -1178,10 +1183,10 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head; + iter->head = cpu_buffer->head_page->read; } else { iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader; + iter->head = cpu_buffer->reader_page->read; } if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; @@ -1200,7 +1205,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; return iter->head_page == cpu_buffer->tail_page && - iter->head == cpu_buffer->tail; + iter->head == cpu_buffer->tail_page->write; } static void @@ -1277,11 +1282,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ - if (cpu_buffer->reader < reader->size) + if (cpu_buffer->reader_page->read < reader->size) goto out; /* Never should we have an index greater than the size */ - WARN_ON(cpu_buffer->reader > reader->size); + WARN_ON(cpu_buffer->reader_page->read > reader->size); /* check if we caught up to the tail */ reader = NULL; @@ -1342,7 +1347,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) rb_update_read_stamp(cpu_buffer, event); length = rb_event_length(event); - cpu_buffer->reader += length; + cpu_buffer->reader_page->read += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) @@ -1373,7 +1378,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * at the tail of the buffer. */ BUG_ON((iter->head_page == cpu_buffer->tail_page) && - (iter->head + length > cpu_buffer->tail)); + (iter->head + length > cpu_buffer->tail_page->write)); rb_update_iter_read_stamp(iter, event); @@ -1623,7 +1628,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); cpu_buffer->reader_page->size = 0; - cpu_buffer->head = cpu_buffer->tail = cpu_buffer->reader = 0; + cpu_buffer->head_page->read = 0; + cpu_buffer->tail_page->write = 0; + cpu_buffer->reader_page->read = 0; cpu_buffer->overrun = 0; cpu_buffer->entries = 0; -- cgit v1.2.1 From bf41a158cacba6ca5fc6407a54e7ad8ce1567e2e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 4 Oct 2008 02:00:59 -0400 Subject: ring-buffer: make reentrant This patch replaces the local_irq_save/restore with preempt_disable/ enable. This allows for interrupts to enter while recording. To write to the ring buffer, you must reserve data, and then commit it. During this time, an interrupt may call a trace function that will also record into the buffer before the commit is made. The interrupt will reserve its entry after the first entry, even though the first entry did not finish yet. The time stamp delta of the interrupt entry will be zero, since in the view of the trace, the interrupt happened during the first field anyway. Locking still takes place when the tail/write moves from one page to the next. The reader always takes the locks. A new page pointer is added, called the commit. The write/tail will always point to the end of all entries. The commit field will point to the last committed entry. Only this commit entry may update the write time stamp. The reader can only go up to the commit. It cannot go past it. If a lot of interrupts come in during a commit that fills up the buffer, and it happens to make it all the way around the buffer back to the commit, then a warning is printed and new events will be dropped. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 487 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 374 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 09d4f0d879a7..94af1fe56bb4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -116,8 +116,8 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) */ struct buffer_page { u64 time_stamp; /* page time stamp */ - unsigned size; /* size of page data */ - unsigned write; /* index for next write */ + local_t write; /* index for next write */ + local_t commit; /* write commited index */ unsigned read; /* index for next read */ struct list_head list; /* list of free pages */ void *page; /* Actual data page */ @@ -157,6 +157,7 @@ struct ring_buffer_per_cpu { struct list_head pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ + struct buffer_page *commit_page; /* commited pages */ struct buffer_page *reader_page; unsigned long overrun; unsigned long entries; @@ -185,12 +186,32 @@ struct ring_buffer_iter { u64 read_stamp; }; -#define RB_WARN_ON(buffer, cond) \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return -1; \ - } +#define RB_WARN_ON(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) + +#define RB_WARN_ON_RET(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return -1; \ + } \ + } while (0) + +#define RB_WARN_ON_ONCE(buffer, cond) \ + do { \ + static int once; \ + if (unlikely(cond) && !once) { \ + once++; \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) /** * check_pages - integrity check of buffer pages @@ -204,22 +225,19 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *page, *tmp; - RB_WARN_ON(cpu_buffer, head->next->prev != head); - RB_WARN_ON(cpu_buffer, head->prev->next != head); + RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); + RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); list_for_each_entry_safe(page, tmp, head, list) { - RB_WARN_ON(cpu_buffer, page->list.next->prev != &page->list); - RB_WARN_ON(cpu_buffer, page->list.prev->next != &page->list); + RB_WARN_ON_RET(cpu_buffer, + page->list.next->prev != &page->list); + RB_WARN_ON_RET(cpu_buffer, + page->list.prev->next != &page->list); } return 0; } -static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) -{ - return cpu_buffer->head_page->size; -} - static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { @@ -286,7 +304,6 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) page->page = (void *)addr; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); - cpu_buffer->reader_page->size = 0; ret = rb_allocate_pages(cpu_buffer, buffer->pages); if (ret < 0) @@ -294,8 +311,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); - cpu_buffer->tail_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; return cpu_buffer; @@ -563,15 +579,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return -ENOMEM; } -static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) -{ - return cpu_buffer->reader_page->read == cpu_buffer->reader_page->size && - (cpu_buffer->tail_page == cpu_buffer->reader_page || - (cpu_buffer->tail_page == cpu_buffer->head_page && - cpu_buffer->head_page->read == - cpu_buffer->tail_page->write)); -} - static inline int rb_null_event(struct ring_buffer_event *event) { return event->type == RINGBUF_TYPE_PADDING; @@ -602,6 +609,33 @@ rb_iter_head_event(struct ring_buffer_iter *iter) return __rb_page_index(iter->head_page, iter->head); } +static inline unsigned rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write); +} + +static inline unsigned rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->commit); +} + +/* Size is determined by what has been commited */ +static inline unsigned rb_page_size(struct buffer_page *bpage) +{ + return rb_page_commit(bpage); +} + +static inline unsigned +rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->commit_page); +} + +static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->head_page); +} + /* * When the tail hits the head and the buffer is in overwrite mode, * the head jumps to the next page and all content on the previous @@ -637,16 +671,76 @@ static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, *page = list_entry(p, struct buffer_page, list); } -static inline void -rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) +static inline unsigned +rb_event_index(struct ring_buffer_event *event) { - cpu_buffer->tail_page->time_stamp = *ts; - cpu_buffer->write_stamp = *ts; + unsigned long addr = (unsigned long)event; + + return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer) +static inline int +rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { - cpu_buffer->head_page->read = 0; + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static inline void +rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + while (cpu_buffer->commit_page->page != (void *)addr) { + RB_WARN_ON(cpu_buffer, + cpu_buffer->commit_page == cpu_buffer->tail_page); + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + } + + /* Now set the commit to the event's index */ + local_set(&cpu_buffer->commit_page->commit, index); +} + +static inline void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + barrier(); + } } static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) @@ -745,61 +839,120 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { struct buffer_page *tail_page, *head_page, *reader_page; - unsigned long tail; + unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; + unsigned long flags; tail_page = cpu_buffer->tail_page; - tail = cpu_buffer->tail_page->write; + write = local_add_return(length, &tail_page->write); + tail = write - length; - if (tail + length > BUF_PAGE_SIZE) { + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) { struct buffer_page *next_page = tail_page; - spin_lock(&cpu_buffer->lock); + spin_lock_irqsave(&cpu_buffer->lock, flags); + rb_inc_page(cpu_buffer, &next_page); head_page = cpu_buffer->head_page; reader_page = cpu_buffer->reader_page; /* we grabbed the lock before incrementing */ - WARN_ON(next_page == reader_page); + RB_WARN_ON(cpu_buffer, next_page == reader_page); + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == cpu_buffer->commit_page)) { + WARN_ON_ONCE(1); + goto out_unlock; + } if (next_page == head_page) { if (!(buffer->flags & RB_FL_OVERWRITE)) { - spin_unlock(&cpu_buffer->lock); - return NULL; + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + goto out_unlock; } - /* count overflows */ - rb_update_overflow(cpu_buffer); + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + rb_update_overflow(cpu_buffer); + + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; + } + } - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - rb_reset_head_page(cpu_buffer); + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + cpu_buffer->tail_page->time_stamp = *ts; } - if (tail != BUF_PAGE_SIZE) { + /* + * The actual tail page has moved forward. + */ + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); - /* page padding */ event->type = RINGBUF_TYPE_PADDING; } - tail_page->size = tail; - tail_page = next_page; - tail_page->size = 0; - tail = 0; - cpu_buffer->tail_page = tail_page; - cpu_buffer->tail_page->write = tail; - rb_add_stamp(cpu_buffer, ts); - spin_unlock(&cpu_buffer->lock); + if (tail <= BUF_PAGE_SIZE) + /* Set the write back to the previous setting */ + local_set(&tail_page->write, tail); + + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } + + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); } - BUG_ON(tail + length > BUF_PAGE_SIZE); + /* We reserved something on the buffer */ + + BUG_ON(write > BUF_PAGE_SIZE); event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->time_stamp = *ts; + return event; + + out_unlock: + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + return NULL; } static int @@ -808,6 +961,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, { struct ring_buffer_event *event; static int once; + int ret; if (unlikely(*delta > (1ULL << 59) && !once++)) { printk(KERN_WARNING "Delta way too big! %llu" @@ -825,21 +979,38 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, RB_LEN_TIME_EXTEND, ts); if (!event) - return -1; + return -EBUSY; - /* check to see if we went to the next page */ - if (cpu_buffer->tail_page->write) { - /* Still on same page, update timestamp */ - event->time_delta = *delta & TS_MASK; - event->array[0] = *delta >> TS_SHIFT; - /* commit the time event */ - cpu_buffer->tail_page->write += - rb_event_length(event); + if (PTR_ERR(event) == -EAGAIN) + return -EAGAIN; + + /* Only a commited time event can update the write stamp */ + if (rb_is_commit(cpu_buffer, event)) { + /* + * If this is the first on the page, then we need to + * update the page itself, and just put in a zero. + */ + if (rb_event_index(event)) { + event->time_delta = *delta & TS_MASK; + event->array[0] = *delta >> TS_SHIFT; + } else { + cpu_buffer->commit_page->time_stamp = *ts; + event->time_delta = 0; + event->array[0] = 0; + } cpu_buffer->write_stamp = *ts; - *delta = 0; + /* let the caller know this was the commit */ + ret = 1; + } else { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + ret = 0; } - return 0; + *delta = 0; + + return ret; } static struct ring_buffer_event * @@ -848,32 +1019,69 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, { struct ring_buffer_event *event; u64 ts, delta; + int commit = 0; + again: ts = ring_buffer_time_stamp(cpu_buffer->cpu); - if (cpu_buffer->tail_page->write) { + /* + * Only the first commit can update the timestamp. + * Yes there is a race here. If an interrupt comes in + * just after the conditional and it traces too, then it + * will also check the deltas. More than one timestamp may + * also be made. But only the entry that did the actual + * commit will be something other than zero. + */ + if (cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer)) { + delta = ts - cpu_buffer->write_stamp; + /* make sure this delta is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (unlikely(ts < cpu_buffer->write_stamp)) + goto again; + if (test_time_stamp(delta)) { - int ret; - ret = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (ret < 0) + commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); + + if (commit == -EBUSY) return NULL; + + if (commit == -EAGAIN) + goto again; + + RB_WARN_ON(cpu_buffer, commit < 0); } - } else { - spin_lock(&cpu_buffer->lock); - rb_add_stamp(cpu_buffer, &ts); - spin_unlock(&cpu_buffer->lock); + } else + /* Non commits have zero deltas */ delta = 0; - } event = __rb_reserve_next(cpu_buffer, type, length, &ts); - if (!event) + if (PTR_ERR(event) == -EAGAIN) + goto again; + + if (!event) { + if (unlikely(commit)) + /* + * Ouch! We needed a timestamp and it was commited. But + * we didn't get our event reserved. + */ + rb_set_commit_to_write(cpu_buffer); return NULL; + } - /* If the reserve went to the next page, our delta is zero */ - if (!cpu_buffer->tail_page->write) + /* + * If the timestamp was commited, make the commit our entry + * now so that we will update it when needed. + */ + if (commit) + rb_set_commit_event(cpu_buffer, event); + else if (!rb_is_commit(cpu_buffer, event)) delta = 0; event->time_delta = delta; @@ -881,6 +1089,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static DEFINE_PER_CPU(int, rb_need_resched); + /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -904,12 +1114,15 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - int cpu; + int cpu, resched; if (atomic_read(&buffer->record_disabled)) return NULL; - local_irq_save(*flags); + /* If we are tracing schedule, we don't want to recurse */ + resched = need_resched(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); if (!cpu_isset(cpu, buffer->cpumask)) @@ -922,26 +1135,42 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, length = rb_calculate_event_length(length); if (length > BUF_PAGE_SIZE) - return NULL; + goto out; event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); if (!event) goto out; + /* + * Need to store resched state on this cpu. + * Only the first needs to. + */ + + if (preempt_count() == 1) + per_cpu(rb_need_resched, cpu) = resched; + return event; out: - local_irq_restore(*flags); + if (resched) + preempt_enable_notrace(); + else + preempt_enable_notrace(); return NULL; } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->tail_page->write += rb_event_length(event); - cpu_buffer->tail_page->size = cpu_buffer->tail_page->write; - cpu_buffer->write_stamp += event->time_delta; cpu_buffer->entries++; + + /* Only process further if we own the commit */ + if (!rb_is_commit(cpu_buffer, event)) + return; + + cpu_buffer->write_stamp += event->time_delta; + + rb_set_commit_to_write(cpu_buffer); } /** @@ -965,7 +1194,16 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); - local_irq_restore(flags); + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) { + if (per_cpu(rb_need_resched, cpu)) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); + } else + preempt_enable_no_resched_notrace(); return 0; } @@ -989,15 +1227,17 @@ int ring_buffer_write(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - unsigned long event_length, flags; + unsigned long event_length; void *body; int ret = -EBUSY; - int cpu; + int cpu, resched; if (atomic_read(&buffer->record_disabled)) return -EBUSY; - local_irq_save(flags); + resched = need_resched(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); if (!cpu_isset(cpu, buffer->cpumask)) @@ -1022,11 +1262,26 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - local_irq_restore(flags); + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); return ret; } +static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = cpu_buffer->reader_page; + struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *commit = cpu_buffer->commit_page; + + return reader->read == rb_page_commit(reader) && + (commit == reader || + (commit == head && + head->read == rb_page_commit(commit))); +} + /** * ring_buffer_record_disable - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. @@ -1204,8 +1459,8 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - return iter->head_page == cpu_buffer->tail_page && - iter->head == cpu_buffer->tail_page->write; + return iter->head_page == cpu_buffer->commit_page && + iter->head == rb_commit_index(cpu_buffer); } static void @@ -1282,15 +1537,16 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ - if (cpu_buffer->reader_page->read < reader->size) + if (cpu_buffer->reader_page->read < rb_page_size(reader)) goto out; /* Never should we have an index greater than the size */ - WARN_ON(cpu_buffer->reader_page->read > reader->size); + RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader)); /* check if we caught up to the tail */ reader = NULL; - if (cpu_buffer->tail_page == cpu_buffer->reader_page) + if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; /* @@ -1301,7 +1557,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) reader = cpu_buffer->head_page; cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; - cpu_buffer->reader_page->size = 0; + + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->commit, 0); /* Make the reader page now replace the head */ reader->list.prev->next = &cpu_buffer->reader_page->list; @@ -1313,7 +1571,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ cpu_buffer->head_page = cpu_buffer->reader_page; - if (cpu_buffer->tail_page != reader) + if (cpu_buffer->commit_page != reader) rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ @@ -1363,8 +1621,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) /* * Check if we are at the end of the buffer. */ - if (iter->head >= iter->head_page->size) { - BUG_ON(iter->head_page == cpu_buffer->tail_page); + if (iter->head >= rb_page_size(iter->head_page)) { + BUG_ON(iter->head_page == cpu_buffer->commit_page); rb_inc_iter(iter); return; } @@ -1377,16 +1635,16 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * This should not be called to advance the header if we are * at the tail of the buffer. */ - BUG_ON((iter->head_page == cpu_buffer->tail_page) && - (iter->head + length > cpu_buffer->tail_page->write)); + BUG_ON((iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer))); rb_update_iter_read_stamp(iter, event); iter->head += length; /* check for end of page padding */ - if ((iter->head >= iter->head_page->size) && - (iter->head_page != cpu_buffer->tail_page)) + if ((iter->head >= rb_page_size(iter->head_page)) && + (iter->head_page != cpu_buffer->commit_page)) rb_advance_iter(iter); } @@ -1420,7 +1678,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - WARN_ON(1); + RB_WARN_ON(cpu_buffer, 1); rb_advance_reader(cpu_buffer); return NULL; @@ -1622,14 +1880,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); - cpu_buffer->head_page->size = 0; - cpu_buffer->tail_page = cpu_buffer->head_page; - cpu_buffer->tail_page->size = 0; - INIT_LIST_HEAD(&cpu_buffer->reader_page->list); - cpu_buffer->reader_page->size = 0; + local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->commit, 0); cpu_buffer->head_page->read = 0; - cpu_buffer->tail_page->write = 0; + + cpu_buffer->tail_page = cpu_buffer->head_page; + cpu_buffer->commit_page = cpu_buffer->head_page; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->commit, 0); cpu_buffer->reader_page->read = 0; cpu_buffer->overrun = 0; -- cgit v1.2.1 From 3ea2e6d71aafe35b8aaf89ed711a283815acfae6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 4 Oct 2008 02:01:00 -0400 Subject: ftrace: make some tracers reentrant Now that the ring buffer is reentrant, some of the ftrace tracers (sched_swich, debugging traces) can also be reentrant. Note: Never make the function tracer reentrant, that can cause recursion problems all over the kernel. The function tracer must disable reentrancy. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 ++-------- kernel/trace/trace_sched_switch.c | 10 ++-------- 2 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1cd2e8143bb4..caa4051ce778 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -839,7 +839,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - long disabled; int cpu; int pc; @@ -850,12 +849,10 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(!atomic_read(&data->disabled))) ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); - atomic_dec(&data->disabled); preempt_enable_notrace(); } @@ -2961,7 +2958,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) struct trace_array_cpu *data; struct print_entry *entry; unsigned long flags, irq_flags; - long disabled; int cpu, len = 0, size, pc; if (!tr->ctrl || tracing_disabled) @@ -2971,9 +2967,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (unlikely(disabled != 1)) + if (unlikely(atomic_read(&data->disabled))) goto out; spin_lock_irqsave(&trace_buf_lock, flags); @@ -2999,7 +2994,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) spin_unlock_irqrestore(&trace_buf_lock, flags); out: - atomic_dec(&data->disabled); preempt_enable_notrace(); return len; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c7fa08a5b7f4..b8f56beb1a62 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -24,7 +24,6 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, { struct trace_array_cpu *data; unsigned long flags; - long disabled; int cpu; int pc; @@ -41,12 +40,10 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(!atomic_read(&data->disabled))) tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); - atomic_dec(&data->disabled); local_irq_restore(flags); } @@ -55,7 +52,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) { struct trace_array_cpu *data; unsigned long flags; - long disabled; int cpu, pc; if (!likely(tracer_enabled)) @@ -67,13 +63,11 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(!atomic_read(&data->disabled))) tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, flags, pc); - atomic_dec(&data->disabled); local_irq_restore(flags); } -- cgit v1.2.1 From c2931e05ec5965597cbfb79ad332d4a29aeceb23 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 4 Oct 2008 22:04:44 +0200 Subject: ftrace: return an error when setting a nonexistent tracer When one try to set a nonexistent tracer, no error is returned as if the name of the tracer was correct. We should return -EINVAL. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index caa4051ce778..78d56614c95b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2381,9 +2381,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, struct tracer *t; char buf[max_tracer_type_len+1]; int i; + size_t ret; if (cnt > max_tracer_type_len) cnt = max_tracer_type_len; + ret = cnt; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2399,7 +2401,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (strcmp(t->name, buf) == 0) break; } - if (!t || t == current_trace) + if (!t) { + ret = -EINVAL; + goto out; + } + if (t == current_trace) goto out; if (current_trace && current_trace->reset) @@ -2412,9 +2418,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + if (ret == cnt) + filp->f_pos += cnt; - return cnt; + return ret; } static ssize_t -- cgit v1.2.1 From 8a5d900cca57115326bc5b9e7f3bac980986c2c8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 4 Oct 2008 13:42:27 -0700 Subject: tracing/fastboot: fix printk format typo in boot tracer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When printing nanoseconds, the right printk format string is %09 not %06... Signed-off-by: Arjan van de Ven Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_boot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a7efe3559654..d0a5e50eeff2 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -62,14 +62,14 @@ static enum print_line_t initcall_print_line(struct trace_iterator *iter) struct timespec rettime = ktime_to_timespec(it->rettime); if (entry->type == TRACE_BOOT) { - ret = trace_seq_printf(s, "[%5ld.%06ld] calling %s @ %i\n", + ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", calltime.tv_sec, calltime.tv_nsec, it->func, it->caller); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "[%5ld.%06ld] initcall %s " + ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " "returned %d after %lld msecs\n", rettime.tv_sec, rettime.tv_nsec, -- cgit v1.2.1 From 2fbc474901933c8f0c09b0280dfbb6780cb8bd60 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 8 Oct 2008 16:51:49 -0700 Subject: ftrace: fix hex output mode of ftrace Fix the output of ftrace in hex mode as the hi/lo nibbles are output in reverse order. Without this patch, the output of ftrace is: raw mode : 6474 0 141531612444 0 140 + 6402 120 S hex mode : 000091a4 00000000 000000023f1f50c1 00000000 c8 000000b2 00009120 87 ffff00c8 00000035 There is an inversion on ouput hex(6474) is 194a [based on a patch by Philippe Reynes ] Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78d56614c95b..36cbb873845f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -336,14 +336,12 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) } #define HEX_CHARS 17 -static const char hex2asc[] = "0123456789abcdef"; static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) { unsigned char hex[HEX_CHARS]; unsigned char *data = mem; - unsigned char byte; int i, j; BUG_ON(len >= HEX_CHARS); @@ -353,10 +351,8 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) #else for (i = len-1, j = 0; i >= 0; i--) { #endif - byte = data[i]; - - hex[j++] = hex2asc[byte & 0x0f]; - hex[j++] = hex2asc[byte >> 4]; + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); } hex[j++] = ' '; -- cgit v1.2.1 From ad0a3b68114e8f3c25ac0045b45a2838f23e3b3a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 8 Oct 2008 17:44:55 -0700 Subject: trace: add build-time check to avoid overrunning hex buffer Remove the runtime BUG_ON and change to a compile-time check in the macro that calls the hex format routine [Noticed by Joe Perches] Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 36cbb873845f..d345d649d073 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -335,7 +335,8 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } -#define HEX_CHARS 17 +#define MAX_MEMHEX_BYTES 8 +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) @@ -344,8 +345,6 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) unsigned char *data = mem; int i, j; - BUG_ON(len >= HEX_CHARS); - #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -1668,6 +1667,7 @@ do { \ #define SEQ_PUT_HEX_FIELD_RET(s, x) \ do { \ + BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ return 0; \ } while (0) -- cgit v1.2.1 From 4519d9e54dcd273975ad0adebad2a08c20428029 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Oct 2008 14:15:43 +0200 Subject: tracing/stacktrace: improve help text Improve the help text that is displayed for CONFIG_STACK_TRACER. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 396aea11398e..11fd03a429f0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,8 +141,17 @@ config STACK_TRACER select FTRACE select STACKTRACE help - This tracer records the max stack of the kernel, and displays - it in debugfs/tracing/stack_trace + This special tracer records the maximum stack footprint of the + kernel and displays it in debugfs/tracing/stack_trace. + + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. Because this logic has to execute in every + kernel function, all the time, this option can slow down the + kernel measurably and is generally intended for kernel + developers only. + + Say N if unsure. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" -- cgit v1.2.1 From 98d9c66ab07471006fd7910cb16453581c41a3e7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Oct 2008 14:27:20 +0200 Subject: tracing/fastboot: improve help text Improve the help text of the boot tracer. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 11fd03a429f0..1cb3e1f616af 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -127,12 +127,17 @@ config BOOT_TRACER select TRACING help This tracer helps developers to optimize boot times: it records - the timings of the initcalls. Its aim is to be parsed by the - /scripts/bootgraph.pl tool to produce pretty graphics about - boot inefficiencies, giving a visual representation of the - delays during initcalls. Note that tracers self tests can't - be enabled if this tracer is selected since only one tracer - should touch the tracing buffer at a time. + the timings of the initcalls and traces key events and the identity + of tasks that can cause boot delays, such as context-switches. + + Its aim is to be parsed by the /scripts/bootgraph.pl tool to + produce pretty graphics about boot inefficiencies, giving a visual + representation of the delays during initcalls - but the raw + /debug/tracing/trace text output is readable too. + + ( Note that tracing self tests can't be enabled if this tracer is + selected, because the self-tests are an initcall as well and that + would invalidate the boot trace. ) config STACK_TRACER bool "Trace max stack" -- cgit v1.2.1 From 8cd162ce230b154e564a1285bb5f89fcf73f0dce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Oct 2008 20:37:23 +0200 Subject: sched: only update rq->clock while holding rq->lock Vatsa noticed rq->clock going funny and tracked it down to an update_rq_clock() outside a rq->lock section. This is a problem because things like double_rq_lock() update the rq->clock value for both rqs. Therefore disabling interrupts isn't strong enough. Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6f230596bd0c..c530b84c7f80 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4441,12 +4441,8 @@ need_resched_nonpreemptible: if (sched_feat(HRTICK)) hrtick_clear(rq); - /* - * Do the rq-clock update outside the rq lock: - */ - local_irq_disable(); + spin_lock_irq(&rq->lock); update_rq_clock(rq); - spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { -- cgit v1.2.1 From 85c0f90978bf50596dbd23854648020f1f9b5bfd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:47 -0700 Subject: irq: introduce nr_irqs at this point nr_irqs is equal NR_IRQS convert a few easy users from NR_IRQS to dynamic nr_irqs. v2: according to Eric, we need to take care of arch without generic_hardirqs Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/autoprobe.c | 10 +++++----- kernel/irq/chip.c | 20 ++++++++++---------- kernel/irq/handle.c | 3 ++- kernel/irq/manage.c | 16 ++++++++-------- kernel/irq/proc.c | 2 +- kernel/irq/resend.c | 4 ++-- kernel/irq/spurious.c | 4 ++-- 7 files changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 533068cfb607..c689e9851a80 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -38,7 +38,7 @@ unsigned long probe_irq_on(void) * something may have generated an irq long ago and we want to * flush such a longstanding irq before considering it as spurious. */ - for (i = NR_IRQS-1; i > 0; i--) { + for (i = nr_irqs-1; i > 0; i--) { desc = irq_desc + i; spin_lock_irq(&desc->lock); @@ -68,7 +68,7 @@ unsigned long probe_irq_on(void) * (we must startup again here because if a longstanding irq * happened in the previous stage, it may have masked itself) */ - for (i = NR_IRQS-1; i > 0; i--) { + for (i = nr_irqs-1; i > 0; i--) { desc = irq_desc + i; spin_lock_irq(&desc->lock); @@ -89,7 +89,7 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ mask = 0; - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { unsigned int status; desc = irq_desc + i; @@ -130,7 +130,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; mask = 0; - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { struct irq_desc *desc = irq_desc + i; unsigned int status; @@ -173,7 +173,7 @@ int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_irqs = 0; - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { struct irq_desc *desc = irq_desc + i; unsigned int status; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5203a599d211..bba66e098703 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -27,7 +27,7 @@ void dynamic_irq_init(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } @@ -60,7 +60,7 @@ void dynamic_irq_cleanup(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } @@ -92,7 +92,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } @@ -121,7 +121,7 @@ int set_irq_type(unsigned int irq, unsigned int type) unsigned long flags; int ret = -ENXIO; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; } @@ -149,7 +149,7 @@ int set_irq_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); return -EINVAL; @@ -175,7 +175,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; @@ -201,7 +201,7 @@ int set_irq_chip_data(unsigned int irq, void *data) struct irq_desc *desc = irq_desc + irq; unsigned long flags; - if (irq >= NR_IRQS || !desc->chip) { + if (irq >= nr_irqs || !desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } @@ -545,7 +545,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); return; @@ -610,7 +610,7 @@ void __init set_irq_noprobe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); return; @@ -628,7 +628,7 @@ void __init set_irq_probe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); return; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f4c8a03a9fbb..e9d022cf593e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -47,6 +47,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * * Controller mappings for all interrupt sources: */ +int nr_irqs = NR_IRQS; struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -265,7 +266,7 @@ void early_init_irq_lock_class(void) { int i; - for (i = 0; i < NR_IRQS; i++) + for (i = 0; i < nr_irqs; i++) lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d363f32dba7d..d5a4333d8f1f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -34,7 +34,7 @@ void synchronize_irq(unsigned int irq) struct irq_desc *desc = irq_desc + irq; unsigned int status; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) return; do { @@ -143,7 +143,7 @@ void disable_irq_nosync(unsigned int irq) struct irq_desc *desc = irq_desc + irq; unsigned long flags; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) return; spin_lock_irqsave(&desc->lock, flags); @@ -171,7 +171,7 @@ void disable_irq(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) return; disable_irq_nosync(irq); @@ -214,7 +214,7 @@ void enable_irq(unsigned int irq) struct irq_desc *desc = irq_desc + irq; unsigned long flags; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) return; spin_lock_irqsave(&desc->lock, flags); @@ -290,7 +290,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irqaction *action; - if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) + if (irq >= nr_irqs || irq_desc[irq].status & IRQ_NOREQUEST) return 0; action = irq_desc[irq].action; @@ -356,7 +356,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) int shared = 0; int ret; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) return -EINVAL; if (desc->chip == &no_irq_chip) @@ -515,7 +515,7 @@ void free_irq(unsigned int irq, void *dev_id) unsigned long flags; WARN_ON(in_interrupt()); - if (irq >= NR_IRQS) + if (irq >= nr_irqs) return; desc = irq_desc + irq; @@ -630,7 +630,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, */ if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) return -EINVAL; if (irq_desc[irq].status & IRQ_NOREQUEST) return -EINVAL; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a09dd29c2fd7..e5225a65a4f9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -240,7 +240,7 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for (i = 0; i < NR_IRQS; i++) + for (i = 0; i < nr_irqs; i++) register_irq_proc(i); } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index a8046791ba2d..cba8aa5bc7f4 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -33,8 +33,8 @@ static void resend_irqs(unsigned long arg) struct irq_desc *desc; int irq; - while (!bitmap_empty(irqs_resend, NR_IRQS)) { - irq = find_first_bit(irqs_resend, NR_IRQS); + while (!bitmap_empty(irqs_resend, nr_irqs)) { + irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); desc = irq_desc + irq; local_irq_disable(); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 19fe9d6ebfe8..e26ca1e90c08 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -91,7 +91,7 @@ static int misrouted_irq(int irq) int i; int ok = 0; - for (i = 1; i < NR_IRQS; i++) { + for (i = 1; i < nr_irqs; i++) { struct irq_desc *desc = irq_desc + i; if (i == irq) /* Already tried */ @@ -107,7 +107,7 @@ static int misrouted_irq(int irq) static void poll_spurious_irqs(unsigned long dummy) { int i; - for (i = 1; i < NR_IRQS; i++) { + for (i = 1; i < nr_irqs; i++) { struct irq_desc *desc = irq_desc + i; unsigned int status; -- cgit v1.2.1 From d60458b224d6b997a582a05cb8c4b9bed9e17a1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:00 -0700 Subject: irq: make irq_desc to use dyn_array Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e9d022cf593e..e94eeca09ea9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -48,6 +48,36 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * Controller mappings for all interrupt sources: */ int nr_irqs = NR_IRQS; + +#ifdef CONFIG_HAVE_DYN_ARRAY +static struct irq_desc irq_desc_init __initdata = { + .status = IRQ_DISABLED, + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +#ifdef CONFIG_SMP + .affinity = CPU_MASK_ALL +#endif +}; + +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + int i; + struct irq_desc *desc; + + desc = *da->name; + + for (i = 0; i < *da->nr; i++) + memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc)); +} + +struct irq_desc *irq_desc; +DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); + +#else + struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -60,6 +90,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { #endif } }; +#endif /* * What should we do if we get a hw irq event on an illegal vector? -- cgit v1.2.1 From fa42d10dd5e1ff373061c0526f272106512301f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 20:50:30 -0700 Subject: irq: sparse irqs, export nr_irqs fix: Building modules, stage 2. MODPOST 458 modules ERROR: "nr_irqs" [drivers/serial/8250.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e94eeca09ea9..6ce3bcc2b8f7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -48,6 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * Controller mappings for all interrupt sources: */ int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_HAVE_DYN_ARRAY static struct irq_desc irq_desc_init __initdata = { -- cgit v1.2.1 From d17a55ded3393ad3878010bb3a8243a15a8d8df5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:01 -0700 Subject: irq: make irqs in kernel stat use per_cpu_dyn_array Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6f230596bd0c..b9d713781b5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4048,9 +4048,12 @@ static inline void idle_balance(int cpu, struct rq *rq) #endif DEFINE_PER_CPU(struct kernel_stat, kstat); - EXPORT_PER_CPU_SYMBOL(kstat); +#ifdef CONFIG_HAVE_DYN_ARRAY +DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL); +#endif + /* * Return p->sum_exec_runtime plus any more ns on the sched_clock * that have not yet been banked in case the task is currently running. -- cgit v1.2.1 From 08678b0841267c1d00d771fe01548d86043d065e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:05 -0700 Subject: generic: sparse irqs: use irq_desc() together with dyn_array, instead of irq_desc[] add CONFIG_HAVE_SPARSE_IRQ to for use condensed array. Get rid of irq_desc[] array assumptions. Preallocate 32 irq_desc, and irq_desc() will try to get more. ( No change in functionality is expected anywhere, except the odd build failure where we missed a code site or where a crossing commit itroduces new irq_desc[] usage. ) v2: according to Eric, change get_irq_desc() to irq_desc() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/autoprobe.c | 10 ++-- kernel/irq/chip.c | 32 +++++++----- kernel/irq/handle.c | 138 ++++++++++++++++++++++++++++++++++++++++++++----- kernel/irq/manage.c | 35 +++++++------ kernel/irq/migration.c | 14 ++--- kernel/irq/proc.c | 36 +++++++------ kernel/irq/resend.c | 2 +- kernel/irq/spurious.c | 5 +- 8 files changed, 199 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index c689e9851a80..c45ab718cf07 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -39,7 +39,7 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for (i = nr_irqs-1; i > 0; i--) { - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -69,7 +69,7 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for (i = nr_irqs-1; i > 0; i--) { - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -92,7 +92,7 @@ unsigned long probe_irq_on(void) for (i = 0; i < nr_irqs; i++) { unsigned int status; - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); status = desc->status; @@ -131,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val) mask = 0; for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; spin_lock_irq(&desc->lock); @@ -174,7 +174,7 @@ int probe_irq_off(unsigned long val) int i, irq_found = 0, nr_irqs = 0; for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; spin_lock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bba66e098703..76c225cf4b26 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -33,7 +33,7 @@ void dynamic_irq_init(unsigned int irq) } /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; @@ -65,7 +65,7 @@ void dynamic_irq_cleanup(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); @@ -100,7 +100,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; @@ -126,7 +126,7 @@ int set_irq_type(unsigned int irq, unsigned int type) return -ENODEV; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); if (type == IRQ_TYPE_NONE) return 0; @@ -155,7 +155,7 @@ int set_irq_data(unsigned int irq, void *data) return -EINVAL; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; spin_unlock_irqrestore(&desc->lock, flags); @@ -180,7 +180,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) @@ -198,9 +198,10 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) */ int set_irq_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; unsigned long flags; + desc = irq_to_desc(irq); if (irq >= nr_irqs || !desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; @@ -219,8 +220,9 @@ EXPORT_SYMBOL(set_irq_chip_data); */ static void default_enable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; + desc = irq_to_desc(irq); desc->chip->unmask(irq); desc->status &= ~IRQ_MASKED; } @@ -237,7 +239,10 @@ static void default_disable(unsigned int irq) */ static unsigned int default_startup(unsigned int irq) { - irq_desc[irq].chip->enable(irq); + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->chip->enable(irq); return 0; } @@ -247,8 +252,9 @@ static unsigned int default_startup(unsigned int irq) */ static void default_shutdown(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; + desc = irq_to_desc(irq); desc->chip->mask(irq); desc->status |= IRQ_MASKED; } @@ -551,7 +557,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); if (!handle) handle = handle_bad_irq; @@ -616,7 +622,7 @@ void __init set_irq_noprobe(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; @@ -634,7 +640,7 @@ void __init set_irq_probe(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6ce3bcc2b8f7..9fc33b3378e6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,6 +18,14 @@ #include "internals.h" +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; +#endif + /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -51,7 +59,8 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_desc irq_desc_init __initdata = { +static struct irq_desc irq_desc_init = { + .irq = -1U, .status = IRQ_DISABLED, .chip = &no_irq_chip, .handle_irq = handle_bad_irq, @@ -62,6 +71,27 @@ static struct irq_desc irq_desc_init __initdata = { #endif }; + +static void init_one_irq_desc(struct irq_desc *desc) +{ + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); +#ifdef CONFIG_TRACE_IRQFLAGS + lockdep_set_class(&desc->lock, &irq_desc_lock_class); +#endif +} + +#ifdef CONFIG_HAVE_SPARSE_IRQ +static int nr_irq_desc = 32; + +static int __init parse_nr_irq_desc(char *arg) +{ + if (arg) + nr_irq_desc = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("nr_irq_desc", parse_nr_irq_desc); + static void __init init_work(void *data) { struct dyn_array *da = data; @@ -71,12 +101,83 @@ static void __init init_work(void *data) desc = *da->name; for (i = 0; i < *da->nr; i++) - memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc)); + init_one_irq_desc(&desc[i]); + + for (i = 1; i < *da->nr; i++) + desc[i-1].next = &desc[i]; } -struct irq_desc *irq_desc; +static struct irq_desc *sparse_irqs; +DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); + +extern int after_bootmem; +extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); +struct irq_desc *irq_to_desc(unsigned int irq) +{ + struct irq_desc *desc, *desc_pri; + int i; + int count = 0; + + BUG_ON(irq == -1U); + + desc_pri = desc = &sparse_irqs[0]; + while (desc) { + if (desc->irq == irq) + return desc; + + if (desc->irq == -1U) { + desc->irq = irq; + return desc; + } + desc_pri = desc; + desc = desc->next; + count++; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + + if (after_bootmem) + desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC); + else + desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0); + + if (!desc) + panic("please boot with nr_irq_desc= %d\n", count * 2); + + for (i = 0; i < nr_irq_desc; i++) + init_one_irq_desc(&desc[i]); + + for (i = 1; i < nr_irq_desc; i++) + desc[i-1].next = &desc[i]; + + desc->irq = irq; + desc_pri->next = desc; + + return desc; +} +#else +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + int i; + struct irq_desc *desc; + + desc = *da->name; + + for (i = 0; i < *da->nr; i++) + init_one_irq_desc(&desc[i]); + +} +static struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); +#endif + #else struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { @@ -85,12 +186,23 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __SPIN_LOCK_UNLOCKED(sparse_irqs->lock), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif } }; + +#endif + +#ifndef CONFIG_HAVE_SPARSE_IRQ +struct irq_desc *irq_to_desc(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_desc[irq]; + + return NULL; +} #endif /* @@ -99,7 +211,10 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { */ static void ack_bad(unsigned int irq) { - print_irq_desc(irq, irq_desc + irq); + struct irq_desc *desc; + + desc = irq_to_desc(irq); + print_irq_desc(irq, desc); ack_bad_irq(irq); } @@ -196,7 +311,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) */ unsigned int __do_IRQ(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned int status; @@ -287,19 +402,16 @@ out: } #endif -#ifdef CONFIG_TRACE_IRQFLAGS - -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; +#ifdef CONFIG_TRACE_IRQFLAGS void early_init_irq_lock_class(void) { +#ifndef CONFIG_HAVE_DYN_ARRAY int i; for (i = 0; i < nr_irqs; i++) lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); +#endif } - #endif + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d5a4333d8f1f..b5943e9f95aa 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,7 +31,7 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned int status; if (irq >= nr_irqs) @@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq); */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || !desc->chip->set_affinity) @@ -81,7 +81,7 @@ int irq_can_set_affinity(unsigned int irq) */ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (!desc->chip->set_affinity) return -EINVAL; @@ -111,14 +111,16 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) int irq_select_affinity(unsigned int irq) { cpumask_t mask; + struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - irq_desc[irq].affinity = mask; - irq_desc[irq].chip->set_affinity(irq, mask); + desc = irq_to_desc(irq); + desc->affinity = mask; + desc->chip->set_affinity(irq, mask); set_balance_irq_affinity(irq, mask); return 0; @@ -140,7 +142,7 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; if (irq >= nr_irqs) @@ -169,7 +171,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (irq >= nr_irqs) return; @@ -211,7 +213,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; if (irq >= nr_irqs) @@ -225,7 +227,7 @@ EXPORT_SYMBOL(enable_irq); static int set_irq_wake_real(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (desc->chip->set_wake) @@ -248,7 +250,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) */ int set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = 0; @@ -288,12 +290,13 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - if (irq >= nr_irqs || irq_desc[irq].status & IRQ_NOREQUEST) + if (irq >= nr_irqs || desc->status & IRQ_NOREQUEST) return 0; - action = irq_desc[irq].action; + action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; @@ -349,7 +352,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, */ int setup_irq(unsigned int irq, struct irqaction *new) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; @@ -518,7 +521,7 @@ void free_irq(unsigned int irq, void *dev_id) if (irq >= nr_irqs) return; - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { @@ -615,6 +618,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, { struct irqaction *action; int retval; + struct irq_desc *desc; #ifdef CONFIG_LOCKDEP /* @@ -632,7 +636,8 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -EINVAL; if (irq >= nr_irqs) return -EINVAL; - if (irq_desc[irq].status & IRQ_NOREQUEST) + desc = irq_to_desc(irq); + if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) return -EINVAL; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 77b7acc875c5..90b920d3f52b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,18 +3,18 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + desc->pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } void move_masked_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) @@ -30,7 +30,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) + if (unlikely(cpus_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,7 +38,7 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); + cpus_and(tmp, desc->pending_mask, cpu_online_map); /* * If there was a valid mask to work with, please @@ -55,12 +55,12 @@ void move_masked_irq(int irq) if (likely(!cpus_empty(tmp))) { desc->chip->set_affinity(irq,tmp); } - cpus_clear(irq_desc[irq].pending_mask); + cpus_clear(desc->pending_mask); } void move_native_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e5225a65a4f9..c2f356c808f6 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)m->private; + struct irq_desc *desc = irq_to_desc((long)m->private); cpumask_t *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_t new_value; int err; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = { static int irq_spurious_read(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct irq_desc *d = &irq_desc[(long) data]; + struct irq_desc *desc = irq_to_desc((long) data); return sprintf(page, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", - d->irq_count, - d->irqs_unhandled, - jiffies_to_msecs(d->last_unhandled)); + desc->irq_count, + desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); } #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; int ret = 1; @@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; + struct irq_desc *desc = irq_to_desc(irq); - if (!irq_desc[irq].dir || action->dir || !action->name || + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; @@ -174,7 +175,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); + action->dir = proc_mkdir(name, desc->dir); } #undef MAX_NAMELEN @@ -185,25 +186,24 @@ void register_irq_proc(unsigned int irq) { char name [MAX_NAMELEN]; struct proc_dir_entry *entry; + struct irq_desc *desc = irq_to_desc(irq); - if (!root_irq_dir || - (irq_desc[irq].chip == &no_irq_chip) || - irq_desc[irq].dir) + if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ - irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); + desc->dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ - proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); + entry = create_proc_entry("spurious", 0444, desc->dir); if (entry) { entry->data = (void *)(long)irq; entry->read_proc = irq_spurious_read; @@ -214,8 +214,10 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) - remove_proc_entry(action->dir->name, irq_desc[irq].dir); + if (action->dir) { + struct irq_desc *desc = irq_to_desc(irq); + remove_proc_entry(action->dir->name, desc->dir); + } } void register_default_affinity_proc(void) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index cba8aa5bc7f4..89c7117acf2b 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -36,7 +36,7 @@ static void resend_irqs(unsigned long arg) while (!bitmap_empty(irqs_resend, nr_irqs)) { irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); - desc = irq_desc + irq; + desc = irq_to_desc(irq); local_irq_disable(); desc->handle_irq(irq, desc); local_irq_enable(); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e26ca1e90c08..b5d906002e1d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -92,11 +92,12 @@ static int misrouted_irq(int irq) int ok = 0; for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc; if (i == irq) /* Already tried */ continue; + desc = irq_to_desc(i); if (try_one_irq(i, desc)) ok = 1; } @@ -108,7 +109,7 @@ static void poll_spurious_irqs(unsigned long dummy) { int i; for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; /* Racy but it doesn't matter */ -- cgit v1.2.1 From 3bf52a4df3ccd25d4154797977c556a2a8b3bc1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 20:50:29 -0700 Subject: irq: sparse irqs, fix IRQ auto-probe crash fix: [ 10.631533] calling yenta_socket_init+0x0/0x20 [ 10.631533] Yenta: CardBus bridge found at 0000:15:00.0 [17aa:2012] [ 10.631533] Yenta: Using INTVAL to route CSC interrupts to PCI [ 10.631533] Yenta: Routing CardBus interrupts to PCI [ 10.631533] Yenta TI: socket 0000:15:00.0, mfunc 0x01d01002, devctl 0x64 [ 10.731599] BUG: unable to handle kernel NULL pointer dereference at 00000040 [ 10.731838] IP: [] _spin_lock_irq+0xf/0x20 [ 10.732221] *pde = 00000000 [ 10.732741] Oops: 0002 [#1] SMP [ 10.733453] [ 10.734253] Pid: 1, comm: swapper Tainted: G W (2.6.27-rc3-tip-00173-gd7eaa4f-dirty #1) [ 10.735188] EIP: 0060:[] EFLAGS: 00010002 CPU: 0 [ 10.735523] EIP is at _spin_lock_irq+0xf/0x20 [ 10.735523] EAX: 00000040 EBX: 00000000 ECX: f6e04c90 EDX: 00000100 [ 10.735523] ESI: 000000df EDI: f6e04c90 EBP: f7867df0 ESP: f7867df0 [ 10.735523] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 [ 10.735523] Process swapper (pid: 1, ti=f7867000 task=f7870000 task.ti=f7867000) [ 10.735523] Stack: f7867e04 c0155fbd 00000000 00000000 f6e04c90 f7867e5c c0c6e319 c0f6a074 [ 10.735523] f6e04c90 000017aa 00002012 c112b648 f791f240 c112b5e0 f7867e44 c010440b [ 10.735523] f791f240 f791f29c c112b8ec f791f240 00000000 f7867e5c c048f893 03c0b648 [ 10.735523] Call Trace: [ 10.735523] [] ? probe_irq_on+0x3d/0x140 [ 10.735523] [] ? yenta_probe+0x529/0x640 [ 10.735523] [] ? mcount_call+0x5/0xa [ 10.735523] [] ? pci_match_device+0xa3/0xb0 [ 10.735523] [] ? pci_device_probe+0x5e/0x80 [ 10.735523] [] ? driver_probe_device+0x83/0x180 [ 10.735523] [] ? __driver_attach+0x74/0x80 [ 10.735523] [] ? bus_for_each_dev+0x49/0x70 [ 10.735523] [] ? driver_attach+0x1e/0x20 [ 10.735523] [] ? __driver_attach+0x0/0x80 [ 10.735523] [] ? bus_add_driver+0x1a3/0x220 [ 10.735523] [] ? pci_device_remove+0x0/0x40 [ 10.735523] [] ? driver_register+0x54/0x130 [ 10.735523] [] ? __pci_register_driver+0x4f/0x90 [ 10.735523] [] ? yenta_socket_init+0x19/0x20 [ 10.735523] [] ? do_one_initcall+0x35/0x160 [ 10.735523] [] ? yenta_socket_init+0x0/0x20 [ 10.735523] [] ? __queue_work+0x36/0x50 [ 10.735523] [] ? queue_work_on+0x3d/0x50 [ 10.735523] [] ? kernel_init+0x148/0x210 [ 10.735523] [] ? kernel_init+0x0/0x210 [ 10.735523] [] ? kernel_thread_helper+0x7/0x10 [ 10.735523] ======================= [ 10.735523] Code: 10 38 f2 74 06 f3 90 8a 10 eb f6 5d 89 c8 c3 8d b6 00 00 00 00 8d bc 27 00 00 00 00 55 89 e5 e8 a4 e8 46 ff fa ba 00 01 00 00 90 <66> 0f c1 10 38 f2 74 06 f3 90 8a 10 eb f6 5d c3 90 55 89 e5 53 as auto-probing wants to iterate over existing irqs. Signed-off-by: Ingo Molnar --- kernel/irq/autoprobe.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index c45ab718cf07..b3a5549ea81e 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -40,6 +40,8 @@ unsigned long probe_irq_on(void) */ for (i = nr_irqs-1; i > 0; i--) { desc = irq_to_desc(i); + if (!desc) + continue; spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -70,6 +72,8 @@ unsigned long probe_irq_on(void) */ for (i = nr_irqs-1; i > 0; i--) { desc = irq_to_desc(i); + if (!desc) + continue; spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -93,6 +97,8 @@ unsigned long probe_irq_on(void) unsigned int status; desc = irq_to_desc(i); + if (!desc) + continue; spin_lock_irq(&desc->lock); status = desc->status; @@ -134,6 +140,8 @@ unsigned int probe_irq_mask(unsigned long val) struct irq_desc *desc = irq_to_desc(i); unsigned int status; + if (!desc) + continue; spin_lock_irq(&desc->lock); status = desc->status; @@ -177,6 +185,8 @@ int probe_irq_off(unsigned long val) struct irq_desc *desc = irq_to_desc(i); unsigned int status; + if (!desc) + continue; spin_lock_irq(&desc->lock); status = desc->status; -- cgit v1.2.1 From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:09 -0700 Subject: x86: move kstat_irqs from kstat to irq_desc based on Eric's patch ... together mold it with dyn_array for irq_desc, will allcate kstat_irqs for nr_irq_desc alltogether if needed. -- at that point nr_cpus is known already. v2: make sure system without generic_hardirqs works they don't have irq_desc v3: fix merging v4: [mingo@elte.hu] fix typo [ mingo@elte.hu ] irq: build fix fix: arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow': arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs' Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 15 +++------ kernel/irq/handle.c | 97 ++++++++++++++++++++++++++++++++++++++--------------- kernel/sched.c | 5 +-- 3 files changed, 76 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 76c225cf4b26..2aa3d4b2fce8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -312,14 +312,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; irqreturn_t action_ret; - const unsigned int cpu = smp_processor_id(); spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -351,7 +350,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -361,7 +359,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* * If its disabled or no action available @@ -399,7 +397,6 @@ out_unlock: void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -409,7 +406,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* * If its disabled or no action available @@ -458,8 +455,6 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - const unsigned int cpu = smp_processor_id(); - spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -476,7 +471,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; } - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* Start handling the irq */ desc->chip->ack(irq); @@ -531,7 +526,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; if (desc->chip->ack) desc->chip->ack(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9fc33b3378e6..1f346990f3f8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -37,7 +37,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; ack_bad_irq(irq); } @@ -80,17 +80,38 @@ static void init_one_irq_desc(struct irq_desc *desc) #endif } -#ifdef CONFIG_HAVE_SPARSE_IRQ -static int nr_irq_desc = 32; +extern int after_bootmem; +extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); -static int __init parse_nr_irq_desc(char *arg) +static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) { - if (arg) - nr_irq_desc = simple_strtoul(arg, NULL, 0); - return 0; + unsigned long bytes, total_bytes; + char *ptr; + int i; + unsigned long phys; + + /* Compute how many bytes we need per irq and allocate them */ + bytes = nr * sizeof(unsigned int); + total_bytes = bytes * nr_desc; + if (after_bootmem) + ptr = kzalloc(total_bytes, GFP_ATOMIC); + else + ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + + if (!ptr) + panic(" can not allocate kstat_irqs\n"); + + phys = __pa(ptr); + printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + + for (i = 0; i < nr_desc; i++) { + desc[i].kstat_irqs = (unsigned int *)ptr; + ptr += bytes; + } } -early_param("nr_irq_desc", parse_nr_irq_desc); static void __init init_work(void *data) { @@ -100,25 +121,44 @@ static void __init init_work(void *data) desc = *da->name; - for (i = 0; i < *da->nr; i++) + for (i = 0; i < *da->nr; i++) { init_one_irq_desc(&desc[i]); +#ifndef CONFIG_HAVE_SPARSE_IRQ + desc[i].irq = i; +#endif + } +#ifdef CONFIG_HAVE_SPARSE_IRQ for (i = 1; i < *da->nr; i++) desc[i-1].next = &desc[i]; +#endif + + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, *da->nr, nr_cpu_ids); } +#ifdef CONFIG_HAVE_SPARSE_IRQ +static int nr_irq_desc = 32; + +static int __init parse_nr_irq_desc(char *arg) +{ + if (arg) + nr_irq_desc = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("nr_irq_desc", parse_nr_irq_desc); + static struct irq_desc *sparse_irqs; DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); -extern int after_bootmem; -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); struct irq_desc *irq_to_desc(unsigned int irq) { struct irq_desc *desc, *desc_pri; int i; int count = 0; + unsigned long phys; + unsigned long total_bytes; BUG_ON(irq == -1U); @@ -141,38 +181,34 @@ struct irq_desc *irq_to_desc(unsigned int irq) */ printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + total_bytes = sizeof(struct irq_desc) * nr_irq_desc; if (after_bootmem) - desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC); + desc = kzalloc(total_bytes, GFP_ATOMIC); else - desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0); + desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); if (!desc) panic("please boot with nr_irq_desc= %d\n", count * 2); + phys = __pa(desc); + printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + for (i = 0; i < nr_irq_desc; i++) init_one_irq_desc(&desc[i]); for (i = 1; i < nr_irq_desc; i++) desc[i-1].next = &desc[i]; + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); + desc->irq = irq; desc_pri->next = desc; return desc; } #else -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - int i; - struct irq_desc *desc; - - desc = *da->name; - for (i = 0; i < *da->nr; i++) - init_one_irq_desc(&desc[i]); - -} static struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); @@ -315,7 +351,7 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -415,3 +451,10 @@ void early_init_irq_lock_class(void) } #endif +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc->kstat_irqs[cpu]; +} +EXPORT_SYMBOL(kstat_irqs_cpu); + diff --git a/kernel/sched.c b/kernel/sched.c index b9d713781b5b..6f230596bd0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4048,11 +4048,8 @@ static inline void idle_balance(int cpu, struct rq *rq) #endif DEFINE_PER_CPU(struct kernel_stat, kstat); -EXPORT_PER_CPU_SYMBOL(kstat); -#ifdef CONFIG_HAVE_DYN_ARRAY -DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL); -#endif +EXPORT_PER_CPU_SYMBOL(kstat); /* * Return p->sum_exec_runtime plus any more ns on the sched_clock -- cgit v1.2.1 From 9059d8fa4a3a9153da53da890039f7f956cc9d19 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:10 -0700 Subject: irq: add irq_desc_without_new add an irq_desc accessor that will not allocate any sparse entry but returns failure if there's no entry present. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 1f346990f3f8..8e55dbe50afc 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -112,7 +112,6 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } - static void __init init_work(void *data) { struct dyn_array *da = data; @@ -149,9 +148,27 @@ static int __init parse_nr_irq_desc(char *arg) early_param("nr_irq_desc", parse_nr_irq_desc); -static struct irq_desc *sparse_irqs; +struct irq_desc *sparse_irqs; DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); +struct irq_desc *__irq_to_desc(unsigned int irq) +{ + struct irq_desc *desc; + + BUG_ON(irq == -1U); + + desc = &sparse_irqs[0]; + while (desc) { + if (desc->irq == irq) + return desc; + + if (desc->irq == -1U) + return NULL; + + desc = desc->next; + } + return NULL; +} struct irq_desc *irq_to_desc(unsigned int irq) { struct irq_desc *desc, *desc_pri; @@ -208,8 +225,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return desc; } #else - -static struct irq_desc *irq_desc; +struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); #endif @@ -239,6 +255,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } +struct irq_desc *__irq_to_desc(unsigned int irq) +{ + return irq_to_desc(irq); +} #endif /* -- cgit v1.2.1 From 2c6927a38f65b53b62f86158fba29a068c4e8b6a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:11 -0700 Subject: irq: replace loop with nr_irqs with for_each_irq_desc There are a handful of loops that go from 0 to nr_irqs and use get_irq_desc() on them. These would allocate all the irq_desc entries, regardless of the need for them. Use the smarter for_each_irq_desc() iterator that will only iterate over the present ones. v2: make sure arch without GENERIC_HARDIRQS work too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/internals.h | 4 ++-- kernel/irq/manage.c | 2 +- kernel/irq/proc.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 422dd00c8bd3..c9767e641980 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -14,11 +14,11 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); #ifdef CONFIG_PROC_FS -extern void register_irq_proc(unsigned int irq); +extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else -static inline void register_irq_proc(unsigned int irq) { } +static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b5943e9f95aa..5070f55fdc16 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -478,7 +478,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; - register_irq_proc(irq); + register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c2f356c808f6..bc0993d86c8b 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -182,11 +182,10 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) #define MAX_NAMELEN 10 -void register_irq_proc(unsigned int irq) +void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; struct proc_dir_entry *entry; - struct irq_desc *desc = irq_to_desc(irq); if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; @@ -230,7 +229,8 @@ void register_default_affinity_proc(void) void init_irq_proc(void) { - int i; + unsigned int irq; + struct irq_desc *desc; /* create /proc/irq */ root_irq_dir = proc_mkdir("irq", NULL); @@ -242,7 +242,7 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for (i = 0; i < nr_irqs; i++) - register_irq_proc(i); + for_each_irq_desc(irq, desc) + register_irq_proc(irq, desc); } -- cgit v1.2.1 From 7d94f7ca401dd7f445fda9a971a48aa5427b3e55 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:14 -0700 Subject: irq: remove >= nr_irqs checking with config_have_sparse_irq remove irq limit checks - nr_irqs is dynamic and we expand anytime. v2: fix checking about result irq_cfg_without_new, so could use msi again v3: use irq_desc_without_new to check irq is valid Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 49 +++++++++++++++++++++++++++---------------------- kernel/irq/manage.c | 43 +++++++++++++++++++++++++++---------------- 2 files changed, 54 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2aa3d4b2fce8..a4bb0da9c88c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -27,13 +27,13 @@ void dynamic_irq_init(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = irq_to_desc(irq); + if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; @@ -60,12 +60,12 @@ void dynamic_irq_cleanup(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); @@ -92,7 +92,8 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } @@ -121,12 +122,12 @@ int set_irq_type(unsigned int irq, unsigned int type) unsigned long flags; int ret = -ENXIO; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; } - desc = irq_to_desc(irq); if (type == IRQ_TYPE_NONE) return 0; @@ -149,13 +150,13 @@ int set_irq_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; spin_unlock_irqrestore(&desc->lock, flags); @@ -175,12 +176,13 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_to_desc(irq); + spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) @@ -201,8 +203,14 @@ int set_irq_chip_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - desc = irq_to_desc(irq); - if (irq >= nr_irqs || !desc->chip) { + desc = __irq_to_desc(irq); + if (!desc) { + printk(KERN_ERR + "Trying to install chip data for IRQ%d\n", irq); + return -EINVAL; + } + + if (!desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } @@ -546,14 +554,13 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); return; } - desc = irq_to_desc(irq); - if (!handle) handle = handle_bad_irq; else if (desc->chip == &no_irq_chip) { @@ -611,14 +618,13 @@ void __init set_irq_noprobe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); return; } - desc = irq_to_desc(irq); - spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); @@ -629,14 +635,13 @@ void __init set_irq_probe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); return; } - desc = irq_to_desc(irq); - spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5070f55fdc16..c0b4d4df6de2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = __irq_to_desc(irq); unsigned int status; - if (irq >= nr_irqs) + if (!desc) return; do { @@ -142,10 +142,11 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -171,9 +172,10 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return; disable_irq_nosync(irq); @@ -213,10 +215,11 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -290,10 +293,14 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; struct irqaction *action; - if (irq >= nr_irqs || desc->status & IRQ_NOREQUEST) + desc = __irq_to_desc(irq); + if (!desc) + return 0; + + if (desc->status & IRQ_NOREQUEST) return 0; action = desc->action; @@ -352,14 +359,15 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, */ int setup_irq(unsigned int irq, struct irqaction *new) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; int shared = 0; int ret; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return -EINVAL; if (desc->chip == &no_irq_chip) @@ -518,10 +526,11 @@ void free_irq(unsigned int irq, void *dev_id) unsigned long flags; WARN_ON(in_interrupt()); - if (irq >= nr_irqs) + + desc = __irq_to_desc(irq); + if (!desc) return; - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { @@ -634,9 +643,11 @@ int request_irq(unsigned int irq, irq_handler_t handler, */ if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; - if (irq >= nr_irqs) + + desc = __irq_to_desc(irq); + if (!desc) return -EINVAL; - desc = irq_to_desc(irq); + if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) -- cgit v1.2.1 From 46926b67fc663d357a1a8174328998a9e49da0b8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:15 -0700 Subject: generic: add irq_desc in function in parameter So we could remove some duplicated calling to irq_desc v2: make sure irq_desc in init/main.c is not used without generic_hardirqs Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 8e55dbe50afc..e1d787e9169b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -197,6 +197,21 @@ struct irq_desc *irq_to_desc(unsigned int irq) * we run out of pre-allocate ones, allocate more */ printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + { + /* double check if some one mess up the list */ + struct irq_desc *desc; + int count = 0; + + desc = &sparse_irqs[0]; + while (desc) { + printk(KERN_DEBUG "found irq_desc for irq %d\n", desc->irq); + if (desc->next) + printk(KERN_DEBUG "found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq); + desc = desc->next; + count++; + } + printk(KERN_DEBUG "all preallocted %d\n", count); + } total_bytes = sizeof(struct irq_desc) * nr_irq_desc; if (after_bootmem) @@ -221,6 +236,21 @@ struct irq_desc *irq_to_desc(unsigned int irq) desc->irq = irq; desc_pri->next = desc; + { + /* double check if some one mess up the list */ + struct irq_desc *desc; + int count = 0; + + desc = &sparse_irqs[0]; + while (desc) { + printk(KERN_DEBUG "1 found irq_desc for irq %d\n", desc->irq); + if (desc->next) + printk(KERN_DEBUG "1 found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq); + desc = desc->next; + count++; + } + printk(KERN_DEBUG "1 all preallocted %d\n", count); + } return desc; } -- cgit v1.2.1 From cb5bc83225a86ca53bbb889ed8439e4fd6cf44ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:17 -0700 Subject: x86_64: rename irq_desc/irq_desc_alloc change names: irq_desc() ==> irq_desc_alloc __irq_desc() ==> irq_desc Also split a few of the uses in lowlevel x86 code. v2: need to check if desc is null in smp_irq_move_cleanup Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 21 +++++++++++---------- kernel/irq/handle.c | 23 +++++------------------ kernel/irq/manage.c | 16 ++++++++-------- 3 files changed, 24 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a4bb0da9c88c..9fc5e69213de 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -27,7 +27,8 @@ void dynamic_irq_init(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = irq_to_desc(irq); + /* first time to use this irq_desc */ + desc = irq_to_desc_alloc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; @@ -60,7 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; @@ -92,7 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; @@ -122,7 +123,7 @@ int set_irq_type(unsigned int irq, unsigned int type) unsigned long flags; int ret = -ENXIO; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; @@ -150,7 +151,7 @@ int set_irq_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); @@ -176,7 +177,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); @@ -203,7 +204,7 @@ int set_irq_chip_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install chip data for IRQ%d\n", irq); @@ -554,7 +555,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); @@ -618,7 +619,7 @@ void __init set_irq_noprobe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); @@ -635,7 +636,7 @@ void __init set_irq_probe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e1d787e9169b..d44e3515eae1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -151,7 +151,7 @@ early_param("nr_irq_desc", parse_nr_irq_desc); struct irq_desc *sparse_irqs; DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); -struct irq_desc *__irq_to_desc(unsigned int irq) +struct irq_desc *irq_to_desc(unsigned int irq) { struct irq_desc *desc; @@ -169,7 +169,7 @@ struct irq_desc *__irq_to_desc(unsigned int irq) } return NULL; } -struct irq_desc *irq_to_desc(unsigned int irq) +struct irq_desc *irq_to_desc_alloc(unsigned int irq) { struct irq_desc *desc, *desc_pri; int i; @@ -186,6 +186,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) if (desc->irq == -1U) { desc->irq = irq; + printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); return desc; } desc_pri = desc; @@ -236,21 +237,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) desc->irq = irq; desc_pri->next = desc; - { - /* double check if some one mess up the list */ - struct irq_desc *desc; - int count = 0; - - desc = &sparse_irqs[0]; - while (desc) { - printk(KERN_DEBUG "1 found irq_desc for irq %d\n", desc->irq); - if (desc->next) - printk(KERN_DEBUG "1 found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq); - desc = desc->next; - count++; - } - printk(KERN_DEBUG "1 all preallocted %d\n", count); - } + printk(KERN_DEBUG "1 found new irq_desc for irq %d and pri will be irq %d\n", desc->irq, desc_pri->irq); return desc; } @@ -285,7 +272,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *__irq_to_desc(unsigned int irq) +struct irq_desc *irq_to_desc_alloc(unsigned int irq) { return irq_to_desc(irq); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c0b4d4df6de2..6df49218632a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,7 +31,7 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = __irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); unsigned int status; if (!desc) @@ -145,7 +145,7 @@ void disable_irq_nosync(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -174,7 +174,7 @@ void disable_irq(unsigned int irq) { struct irq_desc *desc; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -218,7 +218,7 @@ void enable_irq(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -296,7 +296,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) struct irq_desc *desc; struct irqaction *action; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return 0; @@ -366,7 +366,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) int shared = 0; int ret; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return -EINVAL; @@ -527,7 +527,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN_ON(in_interrupt()); - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -644,7 +644,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return -EINVAL; -- cgit v1.2.1 From 67fb283e148e9bd761f73691d3173b6eab9ba8db Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:18 -0700 Subject: irq: separate sparse_irqs from sparse_irqs_free so later don't need compare with -1U Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 115 ++++++++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d44e3515eae1..6d174390f3a0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -112,6 +112,11 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } +#ifdef CONFIG_HAVE_SPARSE_IRQ +static struct irq_desc *sparse_irqs_free; +struct irq_desc *sparse_irqs; +#endif + static void __init init_work(void *data) { struct dyn_array *da = data; @@ -127,13 +132,16 @@ static void __init init_work(void *data) #endif } + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, *da->nr, nr_cpu_ids); + #ifdef CONFIG_HAVE_SPARSE_IRQ for (i = 1; i < *da->nr; i++) desc[i-1].next = &desc[i]; -#endif - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, *da->nr, nr_cpu_ids); + sparse_irqs_free = sparse_irqs; + sparse_irqs = NULL; +#endif } #ifdef CONFIG_HAVE_SPARSE_IRQ @@ -148,23 +156,17 @@ static int __init parse_nr_irq_desc(char *arg) early_param("nr_irq_desc", parse_nr_irq_desc); -struct irq_desc *sparse_irqs; DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); struct irq_desc *irq_to_desc(unsigned int irq) { struct irq_desc *desc; - BUG_ON(irq == -1U); - - desc = &sparse_irqs[0]; + desc = sparse_irqs; while (desc) { if (desc->irq == irq) return desc; - if (desc->irq == -1U) - return NULL; - desc = desc->next; } return NULL; @@ -174,21 +176,12 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) struct irq_desc *desc, *desc_pri; int i; int count = 0; - unsigned long phys; - unsigned long total_bytes; - BUG_ON(irq == -1U); - - desc_pri = desc = &sparse_irqs[0]; + desc_pri = desc = sparse_irqs; while (desc) { if (desc->irq == irq) return desc; - if (desc->irq == -1U) { - desc->irq = irq; - printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); - return desc; - } desc_pri = desc; desc = desc->next; count++; @@ -197,48 +190,62 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) /* * we run out of pre-allocate ones, allocate more */ - printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); - { - /* double check if some one mess up the list */ - struct irq_desc *desc; - int count = 0; - - desc = &sparse_irqs[0]; - while (desc) { - printk(KERN_DEBUG "found irq_desc for irq %d\n", desc->irq); - if (desc->next) - printk(KERN_DEBUG "found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq); - desc = desc->next; - count++; - } - printk(KERN_DEBUG "all preallocted %d\n", count); - } + if (!sparse_irqs_free) { + unsigned long phys; + unsigned long total_bytes; - total_bytes = sizeof(struct irq_desc) * nr_irq_desc; - if (after_bootmem) - desc = kzalloc(total_bytes, GFP_ATOMIC); - else - desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); - if (!desc) - panic("please boot with nr_irq_desc= %d\n", count * 2); + total_bytes = sizeof(struct irq_desc) * nr_irq_desc; + if (after_bootmem) + desc = kzalloc(total_bytes, GFP_ATOMIC); + else + desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - phys = __pa(desc); - printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + if (!desc) + panic("please boot with nr_irq_desc= %d\n", count * 2); - for (i = 0; i < nr_irq_desc; i++) - init_one_irq_desc(&desc[i]); + phys = __pa(desc); + printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - for (i = 1; i < nr_irq_desc; i++) - desc[i-1].next = &desc[i]; + for (i = 0; i < nr_irq_desc; i++) + init_one_irq_desc(&desc[i]); - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); + for (i = 1; i < nr_irq_desc; i++) + desc[i-1].next = &desc[i]; - desc->irq = irq; - desc_pri->next = desc; - printk(KERN_DEBUG "1 found new irq_desc for irq %d and pri will be irq %d\n", desc->irq, desc_pri->irq); + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); + sparse_irqs_free = desc; + } + + desc = sparse_irqs_free; + sparse_irqs_free = sparse_irqs_free->next; + desc->next = NULL; + if (desc_pri) + desc_pri->next = desc; + else + sparse_irqs = desc; + desc->irq = irq; + printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); +#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG + { + /* dump the results */ + struct irq_desc *desc; + unsigned long phys; + unsigned long bytes = sizeof(struct irq_desc); + unsigned int irqx; + + printk(KERN_DEBUG "=========================== %d\n", irq); + printk(KERN_DEBUG "irq_desc dump after get that for %d\n", irq); + for_each_irq_desc(irqx, desc) { + phys = __pa(desc); + printk(KERN_DEBUG "irq_desc %d ==> [%#lx - %#lx]\n", irqx, phys, phys + bytes); + } + printk(KERN_DEBUG "===========================\n"); + } +#endif return desc; } #else -- cgit v1.2.1 From 8b8e8c1bf7275eca859fe551dfa484134eaf013b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:23 -0700 Subject: x86: remove irqbalance in kernel for 32 bit This has been deprecated for years, the user space irqbalanced utility works better with numa, has configurable policies, etc... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6df49218632a..ddc956861a58 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -86,8 +86,6 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) if (!desc->chip->set_affinity) return -EINVAL; - set_balance_irq_affinity(irq, cpumask); - #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { unsigned long flags; @@ -122,7 +120,6 @@ int irq_select_affinity(unsigned int irq) desc->affinity = mask; desc->chip->set_affinity(irq, mask); - set_balance_irq_affinity(irq, mask); return 0; } #endif -- cgit v1.2.1 From e955b5398b660a204854bdff059d050b44090879 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 20:50:37 -0700 Subject: sparseirq: fix lockdep -tip testing found this lockdep splat: [ 0.000000] Initializing CPU#0 [ 0.000000] found new irq_desc for irq 0 [ 0.000000] INFO: trying to register non-static key. [ 0.000000] the code is fine but needs lockdep annotation. [ 0.000000] turning off the locking correctness validator. [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.27-rc3-tip-00191-g98ccb89-dirty #1 [ 0.000000] [] register_lock_class+0x3d2/0x400 [ 0.000000] [] ? mcount_call+0x5/0xa [ 0.000000] [] __lock_acquire+0x22a/0x5d0 [ 0.000000] [] ? mcount_call+0x5/0xa [ 0.000000] [] lock_acquire+0x71/0xa0 [ 0.000000] [] ? set_irq_chip+0x3f/0x90 [ 0.000000] [] _spin_lock_irqsave+0x58/0x90 [ 0.000000] [] ? set_irq_chip+0x3f/0x90 [ 0.000000] [] set_irq_chip+0x3f/0x90 [ 0.000000] [] ? handle_level_irq+0x0/0xe0 [ 0.000000] [] set_irq_chip_and_handler_name+0x1a/0x40 [ 0.000000] [] init_ISA_irqs+0x51/0xa0 [ 0.000000] [] pre_intr_init_hook+0x25/0x30 [ 0.000000] [] native_init_IRQ+0x13/0x370 [ 0.000000] [] ? lock_release+0xcc/0x1d0 [ 0.000000] [] ? mcount_call+0x5/0xa [ 0.000000] [] ? __mutex_unlock_slowpath+0x92/0x110 [ 0.000000] [] ? mutex_unlock+0xd/0x10 [ 0.000000] [] ? cpu_maps_update_done+0x12/0x20 [ 0.000000] [] ? register_cpu_notifier+0x23/0x30 [ 0.000000] [] init_IRQ+0xe/0x10 [ 0.000000] [] start_kernel+0x1c5/0x340 [ 0.000000] [] ? unknown_bootoption+0x0/0x210 [ 0.000000] [] i386_start_kernel+0x6b/0x80 [ 0.000000] ======================= [ 0.000000] found new irq_desc for irq 1 [ 0.000000] found new irq_desc for irq 2 [ 0.000000] found new irq_desc for irq 3 this: static void init_one_irq_desc(struct irq_desc *desc) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); #ifdef CONFIG_TRACE_IRQFLAGS lockdep_set_class(&desc->lock, &irq_desc_lock_class); #endif } should be unconditional. Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6d174390f3a0..24c83a3cee4d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,13 +18,10 @@ #include "internals.h" -#ifdef CONFIG_TRACE_IRQFLAGS - /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ static struct lock_class_key irq_desc_lock_class; -#endif /** * handle_bad_irq - handle spurious and unhandled irqs @@ -75,9 +72,7 @@ static struct irq_desc irq_desc_init = { static void init_one_irq_desc(struct irq_desc *desc) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); -#ifdef CONFIG_TRACE_IRQFLAGS lockdep_set_class(&desc->lock, &irq_desc_lock_class); -#endif } extern int after_bootmem; -- cgit v1.2.1 From e89eb43863c2d9f11a3bbe766766fe646e6c50d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Aug 2008 20:46:25 -0700 Subject: x86: sparse_irq needs spin_lock in allocations Suresh Siddha noticed that we should have a spinlock around it. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 24c83a3cee4d..d638a911cbc1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -107,6 +107,11 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } +/* + * Protect the sparse_irqs_free freelist: + */ +static DEFINE_SPINLOCK(sparse_irq_lock); + #ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_desc *sparse_irqs_free; struct irq_desc *sparse_irqs; @@ -166,11 +171,13 @@ struct irq_desc *irq_to_desc(unsigned int irq) } return NULL; } + struct irq_desc *irq_to_desc_alloc(unsigned int irq) { struct irq_desc *desc, *desc_pri; - int i; + unsigned long flags; int count = 0; + int i; desc_pri = desc = sparse_irqs; while (desc) { @@ -182,6 +189,7 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) count++; } + spin_lock_irqsave(&sparse_irq_lock, flags); /* * we run out of pre-allocate ones, allocate more */ @@ -223,6 +231,9 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) else sparse_irqs = desc; desc->irq = irq; + + spin_unlock_irqrestore(&sparse_irq_lock, flags); + printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); #ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG { -- cgit v1.2.1 From 8c464a4b23ca283b414022ebc77787f3c7040fa7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 Aug 2008 12:41:19 -0700 Subject: sparseirq: move kstat_irqs from kstat to irq_desc - fix fix non-sparseirq architectures. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 21 ++++++++++++++++++++- kernel/irq/handle.c | 10 ++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9fc5e69213de..4ef555c50db8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -327,7 +327,11 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); +#ifdef CONFIG_HAVE_DYN_ARRAY kstat_irqs_this_cpu(desc)++; +#else + kstat_irqs_this_cpu(irq)++; +#endif action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -368,7 +372,11 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); +#ifdef CONFIG_HAVE_DYN_ARRAY kstat_irqs_this_cpu(desc)++; +#else + kstat_irqs_this_cpu(irq)++; +#endif /* * If its disabled or no action available @@ -415,7 +423,11 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); +#ifdef CONFIG_HAVE_DYN_ARRAY kstat_irqs_this_cpu(desc)++; +#else + kstat_irqs_this_cpu(irq)++; +#endif /* * If its disabled or no action available @@ -479,8 +491,11 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc, irq); goto out_unlock; } - +#ifdef CONFIG_HAVE_DYN_ARRAY kstat_irqs_this_cpu(desc)++; +#else + kstat_irqs_this_cpu(irq)++; +#endif /* Start handling the irq */ desc->chip->ack(irq); @@ -535,7 +550,11 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; +#ifdef CONFIG_HAVE_DYN_ARRAY kstat_irqs_this_cpu(desc)++; +#else + kstat_irqs_this_cpu(irq)++; +#endif if (desc->chip->ack) desc->chip->ack(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d638a911cbc1..eae69373a9c6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -34,7 +34,11 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); +#ifdef CONFIG_HAVE_DYN_ARRAY kstat_irqs_this_cpu(desc)++; +#else + kstat_irqs_this_cpu(irq)++; +#endif ack_bad_irq(irq); } @@ -401,7 +405,11 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; +#ifdef CONFIG_HAVE_DYN_ARRAY kstat_irqs_this_cpu(desc)++; +#else + kstat_irqs_this_cpu(irq)++; +#endif if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -501,10 +509,12 @@ void early_init_irq_lock_class(void) } #endif +#ifdef CONFIG_HAVE_DYN_ARRAY unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc->kstat_irqs[cpu]; } +#endif EXPORT_SYMBOL(kstat_irqs_cpu); -- cgit v1.2.1 From 21056830c4e5c6735f1d49453398d7186ca4e8d7 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Fri, 5 Sep 2008 09:10:40 -0500 Subject: irq: set_irq_chip() has redundant call to irq_to_desc() Extraneous call to irq_to_desc(). Signed-off-by: Dean Nelson Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4ef555c50db8..570d1ea1db5d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -102,7 +102,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; -- cgit v1.2.1 From 932775a4ab622e3c99bd59f14cc7d96722f79501 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 5 Sep 2008 18:02:15 -0700 Subject: x86: HPET_MSI change IRQ affinity in process context when it is disabled Change the IRQ affinity in the process context when the IRQ is disabled. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ddc956861a58..ad2ce72c83c4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -87,10 +87,11 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) return -EINVAL; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { + if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { unsigned long flags; spin_lock_irqsave(&desc->lock, flags); + desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); spin_unlock_irqrestore(&desc->lock, flags); } else -- cgit v1.2.1 From e00585bb7fc3d0b601181b765a254df7ff4ea59b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Sep 2008 01:53:50 -0700 Subject: irq: fix irqpoll && sparseirq Steven Noonan reported a boot hang when using irqpoll and CONFIG_HAVE_SPARSE_IRQ=y. The irqpoll loop needs to be updated to not iterate from 1 to nr_irqs but to iterate via for_each_irq_desc(). (in the former case desc can be NULL which crashes the box) Reported-by: Steven Noonan Tested-by: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/irq/spurious.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index b5d906002e1d..ec5a4bef3054 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -90,14 +90,15 @@ static int misrouted_irq(int irq) { int i; int ok = 0; + struct irq_desc *desc; - for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc; + for_each_irq_desc(i, desc) { + if (!i) + continue; if (i == irq) /* Already tried */ continue; - desc = irq_to_desc(i); if (try_one_irq(i, desc)) ok = 1; } @@ -108,10 +109,14 @@ static int misrouted_irq(int irq) static void poll_spurious_irqs(unsigned long dummy) { int i; - for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc = irq_to_desc(i); + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { unsigned int status; + if (!i) + continue; + /* Racy but it doesn't matter */ status = desc->status; barrier(); @@ -278,7 +283,7 @@ static int __init irqfixup_setup(char *str) __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); +MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { -- cgit v1.2.1 From 9d98598d2fc286c8dbcd0b681168639528428db0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Sep 2008 00:12:26 -0700 Subject: sparseirq: remove some debug print out Signed-off-by: Yinghai Lu Cc: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index eae69373a9c6..710db0ec97e3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -238,24 +238,6 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) spin_unlock_irqrestore(&sparse_irq_lock, flags); - printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_desc *desc; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_desc); - unsigned int irqx; - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_desc dump after get that for %d\n", irq); - for_each_irq_desc(irqx, desc) { - phys = __pa(desc); - printk(KERN_DEBUG "irq_desc %d ==> [%#lx - %#lx]\n", irqx, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif return desc; } #else -- cgit v1.2.1 From 2976fe20125587c944c8df48d991c38f0891fb28 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 18 Sep 2008 16:10:48 -0700 Subject: fix warning: "x86: sparse_irq needs spin_lock in allocations" caused by commit a532e19680ada3b8579b81e67e76d3ebd19c340f Author: Yinghai Lu Date: Wed Aug 20 20:46:25 2008 -0700 x86: sparse_irq needs spin_lock in allocations Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 710db0ec97e3..7f625fbc9aa4 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,12 +111,11 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } +#ifdef CONFIG_HAVE_SPARSE_IRQ /* * Protect the sparse_irqs_free freelist: */ static DEFINE_SPINLOCK(sparse_irq_lock); - -#ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_desc *sparse_irqs_free; struct irq_desc *sparse_irqs; #endif -- cgit v1.2.1 From aac3f2b6f6e88827432c4050ac73552f24b19de1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Sep 2008 19:04:35 -0700 Subject: x86: fix typo in irq_desc array when SPARSE_IRQ is not used, should still use irq_desc->lock Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 7f625fbc9aa4..fb6bdb602a93 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -253,7 +253,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(sparse_irqs->lock), + .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif -- cgit v1.2.1 From c6b7674f323622d86316bf7951ad9cae1ce24642 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:31:29 +0200 Subject: genirq: use inline function for irq_to_desc For the non sparse irq case an inline function is perfectly fine. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fb6bdb602a93..c19896f895f9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -262,20 +262,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { #endif -#ifndef CONFIG_HAVE_SPARSE_IRQ -struct irq_desc *irq_to_desc(unsigned int irq) -{ - if (irq < nr_irqs) - return &irq_desc[irq]; - - return NULL; -} -struct irq_desc *irq_to_desc_alloc(unsigned int irq) -{ - return irq_to_desc(irq); -} -#endif - /* * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. -- cgit v1.2.1 From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:16:55 +0200 Subject: genirq: remove sparse irq code This code is not ready, but we need to rip it out instead of rebasing as we would lose the APIC/IO_APIC unification otherwise. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 114 ---------------------------------------------------- 1 file changed, 114 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c19896f895f9..f837133cdfbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,15 +111,6 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* - * Protect the sparse_irqs_free freelist: - */ -static DEFINE_SPINLOCK(sparse_irq_lock); -static struct irq_desc *sparse_irqs_free; -struct irq_desc *sparse_irqs; -#endif - static void __init init_work(void *data) { struct dyn_array *da = data; @@ -130,121 +121,16 @@ static void __init init_work(void *data) for (i = 0; i < *da->nr; i++) { init_one_irq_desc(&desc[i]); -#ifndef CONFIG_HAVE_SPARSE_IRQ desc[i].irq = i; -#endif } /* init kstat_irqs, nr_cpu_ids is ready already */ init_kstat_irqs(desc, *da->nr, nr_cpu_ids); - -#ifdef CONFIG_HAVE_SPARSE_IRQ - for (i = 1; i < *da->nr; i++) - desc[i-1].next = &desc[i]; - - sparse_irqs_free = sparse_irqs; - sparse_irqs = NULL; -#endif -} - -#ifdef CONFIG_HAVE_SPARSE_IRQ -static int nr_irq_desc = 32; - -static int __init parse_nr_irq_desc(char *arg) -{ - if (arg) - nr_irq_desc = simple_strtoul(arg, NULL, 0); - return 0; -} - -early_param("nr_irq_desc", parse_nr_irq_desc); - -DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - struct irq_desc *desc; - - desc = sparse_irqs; - while (desc) { - if (desc->irq == irq) - return desc; - - desc = desc->next; - } - return NULL; } -struct irq_desc *irq_to_desc_alloc(unsigned int irq) -{ - struct irq_desc *desc, *desc_pri; - unsigned long flags; - int count = 0; - int i; - - desc_pri = desc = sparse_irqs; - while (desc) { - if (desc->irq == irq) - return desc; - - desc_pri = desc; - desc = desc->next; - count++; - } - - spin_lock_irqsave(&sparse_irq_lock, flags); - /* - * we run out of pre-allocate ones, allocate more - */ - if (!sparse_irqs_free) { - unsigned long phys; - unsigned long total_bytes; - - printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); - - total_bytes = sizeof(struct irq_desc) * nr_irq_desc; - if (after_bootmem) - desc = kzalloc(total_bytes, GFP_ATOMIC); - else - desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!desc) - panic("please boot with nr_irq_desc= %d\n", count * 2); - - phys = __pa(desc); - printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_desc; i++) - init_one_irq_desc(&desc[i]); - - for (i = 1; i < nr_irq_desc; i++) - desc[i-1].next = &desc[i]; - - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); - - sparse_irqs_free = desc; - } - - desc = sparse_irqs_free; - sparse_irqs_free = sparse_irqs_free->next; - desc->next = NULL; - if (desc_pri) - desc_pri->next = desc; - else - sparse_irqs = desc; - desc->irq = irq; - - spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} -#else struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); -#endif - #else struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { -- cgit v1.2.1 From ee32c9732244bde4b9b59eeac2814c23e2b71f8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:34:09 +0200 Subject: genirq: remove irq_to_desc_alloc Remove the leftover of sparseirqs. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 570d1ea1db5d..e6f73dbfcc3d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -28,7 +28,7 @@ void dynamic_irq_init(unsigned int irq) unsigned long flags; /* first time to use this irq_desc */ - desc = irq_to_desc_alloc(irq); + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; -- cgit v1.2.1 From d6c88a507ef0b6afdb013cba4e7804ba7324d99a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 15:27:23 +0200 Subject: genirq: revert dynarray Revert the dynarray changes. They need more thought and polishing. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 30 +++----------- kernel/irq/handle.c | 114 +++++----------------------------------------------- 2 files changed, 14 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e6f73dbfcc3d..d96d6f687c48 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -326,11 +326,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -371,11 +367,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -422,11 +414,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -490,11 +478,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc, irq); goto out_unlock; } -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->chip->ack(irq); @@ -549,11 +533,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); if (desc->chip->ack) desc->chip->ack(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f837133cdfbe..9fe86b3a60a5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,11 +18,6 @@ #include "internals.h" -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; - /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -30,15 +25,10 @@ static struct lock_class_key irq_desc_lock_class; * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void -handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); ack_bad_irq(irq); } @@ -59,80 +49,6 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); -#ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_desc irq_desc_init = { - .irq = -1U, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif -}; - - -static void init_one_irq_desc(struct irq_desc *desc) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - lockdep_set_class(&desc->lock, &irq_desc_lock_class); -} - -extern int after_bootmem; -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); - -static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) -{ - unsigned long bytes, total_bytes; - char *ptr; - int i; - unsigned long phys; - - /* Compute how many bytes we need per irq and allocate them */ - bytes = nr * sizeof(unsigned int); - total_bytes = bytes * nr_desc; - if (after_bootmem) - ptr = kzalloc(total_bytes, GFP_ATOMIC); - else - ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!ptr) - panic(" can not allocate kstat_irqs\n"); - - phys = __pa(ptr); - printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_desc; i++) { - desc[i].kstat_irqs = (unsigned int *)ptr; - ptr += bytes; - } -} - -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - int i; - struct irq_desc *desc; - - desc = *da->name; - - for (i = 0; i < *da->nr; i++) { - init_one_irq_desc(&desc[i]); - desc[i].irq = i; - } - - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, *da->nr, nr_cpu_ids); -} - -struct irq_desc *irq_desc; -DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); - -#else - struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -146,8 +62,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -#endif - /* * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. @@ -258,11 +172,8 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -351,23 +262,16 @@ out: #ifdef CONFIG_TRACE_IRQFLAGS +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + void early_init_irq_lock_class(void) { -#ifndef CONFIG_HAVE_DYN_ARRAY int i; for (i = 0; i < nr_irqs; i++) lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); -#endif } #endif - -#ifdef CONFIG_HAVE_DYN_ARRAY -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc->kstat_irqs[cpu]; -} -#endif -EXPORT_SYMBOL(kstat_irqs_cpu); - -- cgit v1.2.1 From d3c60047bdb03199b93497ac40bd531315d43a86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 09:55:00 +0200 Subject: genirq: cleanup the sparseirq modifications Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 43 +++++++++++++------------------------------ kernel/irq/handle.c | 3 +-- kernel/irq/manage.c | 38 +++++++++++++++++++++++--------------- kernel/irq/proc.c | 1 + kernel/irq/spurious.c | 18 ++++++++++-------- 5 files changed, 48 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d96d6f687c48..4895fde4eb93 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -24,11 +24,9 @@ */ void dynamic_irq_init(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - /* first time to use this irq_desc */ - desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; @@ -58,10 +56,9 @@ void dynamic_irq_init(unsigned int irq) */ void dynamic_irq_cleanup(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; @@ -90,10 +87,9 @@ void dynamic_irq_cleanup(unsigned int irq) */ int set_irq_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; @@ -118,11 +114,10 @@ EXPORT_SYMBOL(set_irq_chip); */ int set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = -ENXIO; - desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; @@ -147,10 +142,9 @@ EXPORT_SYMBOL(set_irq_type); */ int set_irq_data(unsigned int irq, void *data) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); @@ -173,10 +167,9 @@ EXPORT_SYMBOL(set_irq_data); */ int set_irq_msi(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); @@ -200,10 +193,9 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) */ int set_irq_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install chip data for IRQ%d\n", irq); @@ -228,9 +220,8 @@ EXPORT_SYMBOL(set_irq_chip_data); */ static void default_enable(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); - desc = irq_to_desc(irq); desc->chip->unmask(irq); desc->status &= ~IRQ_MASKED; } @@ -247,11 +238,9 @@ static void default_disable(unsigned int irq) */ static unsigned int default_startup(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); - desc = irq_to_desc(irq); desc->chip->enable(irq); - return 0; } @@ -260,9 +249,8 @@ static unsigned int default_startup(unsigned int irq) */ static void default_shutdown(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); - desc = irq_to_desc(irq); desc->chip->mask(irq); desc->status |= IRQ_MASKED; } @@ -550,10 +538,9 @@ void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); @@ -614,13 +601,11 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void __init set_irq_noprobe(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); - return; } @@ -631,13 +616,11 @@ void __init set_irq_noprobe(unsigned int irq) void __init set_irq_probe(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - return; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9fe86b3a60a5..a69368ff607f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -68,9 +68,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { */ static void ack_bad(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); - desc = irq_to_desc(irq); print_irq_desc(irq, desc); ack_bad_irq(irq); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ad2ce72c83c4..c498a1b8c621 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -140,10 +140,9 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) return; @@ -170,9 +169,8 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); - desc = irq_to_desc(irq); if (!desc) return; @@ -213,10 +211,9 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - desc = irq_to_desc(irq); if (!desc) return; @@ -291,10 +288,9 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - desc = irq_to_desc(irq); if (!desc) return 0; @@ -355,16 +351,15 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. */ -int setup_irq(unsigned int irq, struct irqaction *new) +static int +__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) { - struct irq_desc *desc; struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; int shared = 0; int ret; - desc = irq_to_desc(irq); if (!desc) return -EINVAL; @@ -503,6 +498,20 @@ mismatch: return -EBUSY; } +/** + * setup_irq - setup an interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt + * + * Used to statically setup interrupts in the early boot process. + */ +int setup_irq(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __setup_irq(irq, desc, act); +} + /** * free_irq - free an interrupt * @irq: Interrupt line to free @@ -519,13 +528,12 @@ mismatch: */ void free_irq(unsigned int irq, void *dev_id) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction **p; unsigned long flags; WARN_ON(in_interrupt()); - desc = irq_to_desc(irq); if (!desc) return; @@ -624,8 +632,8 @@ int request_irq(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; - int retval; struct irq_desc *desc; + int retval; #ifdef CONFIG_LOCKDEP /* @@ -662,7 +670,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, action->next = NULL; action->dev_id = dev_id; - retval = setup_irq(irq, action); + retval = __setup_irq(irq, desc, action); if (retval) kfree(action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index bc0993d86c8b..fac014a81b24 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -215,6 +215,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) { if (action->dir) { struct irq_desc *desc = irq_to_desc(irq); + remove_proc_entry(action->dir->name, desc->dir); } } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index ec5a4bef3054..dd364c11e56e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -26,8 +26,7 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); static int try_one_irq(int irq, struct irq_desc *desc) { struct irqaction *action; - int ok = 0; - int work = 0; /* Did we do work for a real IRQ */ + int ok = 0, work = 0; spin_lock(&desc->lock); /* Already running on another processor */ @@ -88,9 +87,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) static int misrouted_irq(int irq) { - int i; - int ok = 0; struct irq_desc *desc; + int i, ok = 0; for_each_irq_desc(i, desc) { if (!i) @@ -108,8 +106,8 @@ static int misrouted_irq(int irq) static void poll_spurious_irqs(unsigned long dummy) { - int i; struct irq_desc *desc; + int i; for_each_irq_desc(i, desc) { unsigned int status; @@ -126,7 +124,8 @@ static void poll_spurious_irqs(unsigned long dummy) try_one_irq(i, desc); } - mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } /* @@ -177,7 +176,9 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) } } -static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +static inline int +try_misrouted_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; @@ -253,7 +254,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->depth++; desc->chip->disable(irq); - mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } desc->irqs_unhandled = 0; } -- cgit v1.2.1 From 10e580842ec8e53dddf62e1ab1871f4906477376 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 14:19:04 +0200 Subject: genirq: use iterators for irq_desc loops Use for_each_irq_desc[_reverse] for all the iteration loops. Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 45 ++++++++++++--------------------------------- kernel/irq/handle.c | 5 +++-- 2 files changed, 15 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index b3a5549ea81e..0cbff18efb6d 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -30,19 +30,16 @@ static DEFINE_MUTEX(probing_active); unsigned long probe_irq_on(void) { struct irq_desc *desc; - unsigned long mask; - unsigned int i; + unsigned long mask = 0; + unsigned int status; + int i; mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to * flush such a longstanding irq before considering it as spurious. */ - for (i = nr_irqs-1; i > 0; i--) { - desc = irq_to_desc(i); - if (!desc) - continue; - + for_each_irq_desc_reverse(i, desc) { spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* @@ -70,11 +67,7 @@ unsigned long probe_irq_on(void) * (we must startup again here because if a longstanding irq * happened in the previous stage, it may have masked itself) */ - for (i = nr_irqs-1; i > 0; i--) { - desc = irq_to_desc(i); - if (!desc) - continue; - + for_each_irq_desc_reverse(i, desc) { spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -92,13 +85,7 @@ unsigned long probe_irq_on(void) /* * Now filter out any obviously spurious interrupts */ - mask = 0; - for (i = 0; i < nr_irqs; i++) { - unsigned int status; - - desc = irq_to_desc(i); - if (!desc) - continue; + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; @@ -132,16 +119,11 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int mask; + unsigned int status, mask = 0; + struct irq_desc *desc; int i; - mask = 0; - for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_to_desc(i); - unsigned int status; - - if (!desc) - continue; + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; @@ -180,13 +162,10 @@ EXPORT_SYMBOL(probe_irq_mask); int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_irqs = 0; + struct irq_desc *desc; + unsigned int status; - for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_to_desc(i); - unsigned int status; - - if (!desc) - continue; + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a69368ff607f..c815b42d0f5b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -268,9 +268,10 @@ static struct lock_class_key irq_desc_lock_class; void early_init_irq_lock_class(void) { + struct irq_desc *desc; int i; - for (i = 0; i < nr_irqs; i++) - lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); + for_each_irq_desc(i, desc) + lockdep_set_class(&desc->lock, &irq_desc_lock_class); } #endif -- cgit v1.2.1 From 63d659d556f145d33798b8ad19ced10c254fe445 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 15:33:18 +0200 Subject: genirq: fix name space collision of nr_irqs in autoprobe.c probe_irq_off() is disfunctional as the local nr_irqs is referenced instead of the global one for the for_each_irq_desc() iterator. Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 0cbff18efb6d..cc0f7321b8ce 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -161,7 +161,7 @@ EXPORT_SYMBOL(probe_irq_mask); */ int probe_irq_off(unsigned long val) { - int i, irq_found = 0, nr_irqs = 0; + int i, irq_found = 0, nr_of_irqs = 0; struct irq_desc *desc; unsigned int status; @@ -171,9 +171,9 @@ int probe_irq_off(unsigned long val) if (status & IRQ_AUTODETECT) { if (!(status & IRQ_WAITING)) { - if (!nr_irqs) + if (!nr_of_irqs) irq_found = i; - nr_irqs++; + nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); @@ -182,7 +182,7 @@ int probe_irq_off(unsigned long val) } mutex_unlock(&probing_active); - if (nr_irqs > 1) + if (nr_of_irqs > 1) irq_found = -irq_found; return irq_found; -- cgit v1.2.1 From 118a9069f06ff591d51a3133e242f0c256ba2db7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 17 Oct 2008 02:38:36 -0500 Subject: module: remove CONFIG_KMOD in comment after #endif Signed-off-by: Rusty Russell --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a0befb..58f9c2ed36be 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -113,7 +113,7 @@ int request_module(const char *fmt, ...) return ret; } EXPORT_SYMBOL(request_module); -#endif /* CONFIG_KMOD */ +#endif /* CONFIG_MODULES */ struct subprocess_info { struct work_struct work; -- cgit v1.2.1 From e94320939f44e0cbaccc3f259a5778abced4949c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 23 Sep 2008 23:51:11 +0400 Subject: modules: fix module "notes" kobject leak Fix "notes" kobject leak It happens every rmmod if KALLSYMS=y and SYSFS=y. # modprobe foo kobject: 'foo' (ffffffffa00743d0): kobject_add_internal: parent: 'module', set: 'module' kobject: 'holders' (ffff88017e7c5770): kobject_add_internal: parent: 'foo', set: '' kobject: 'foo' (ffffffffa00743d0): kobject_uevent_env kobject: 'foo' (ffffffffa00743d0): fill_kobj_path: path = '/module/foo' kobject: 'notes' (ffff88017fa9b668): kobject_add_internal: parent: 'foo', set: '' ^^^^^ # rmmod foo kobject: 'holders' (ffff88017e7c5770): kobject_cleanup kobject: 'holders' (ffff88017e7c5770): auto cleanup kobject_del kobject: 'holders' (ffff88017e7c5770): calling ktype release kobject: (ffff88017e7c5770): dynamic_kobj_release kobject: 'holders': free name kobject: 'foo' (ffffffffa00743d0): kobject_cleanup kobject: 'foo' (ffffffffa00743d0): does not have a release() function, it is broken and must be fixed. kobject: 'foo' (ffffffffa00743d0): auto cleanup 'remove' event kobject: 'foo' (ffffffffa00743d0): kobject_uevent_env kobject: 'foo' (ffffffffa00743d0): fill_kobj_path: path = '/module/foo' kobject: 'foo' (ffffffffa00743d0): auto cleanup kobject_del kobject: 'foo': free name [whooops] Signed-off-by: Alexey Dobriyan Cc: stable Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04b..d5fcd24e5aff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1173,7 +1173,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, while (i-- > 0) sysfs_remove_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); - kobject_del(notes_attrs->dir); + kobject_put(notes_attrs->dir); } kfree(notes_attrs); } -- cgit v1.2.1 From 346e15beb5343c2eb8216d820f2ed8f150822b08 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 12 Aug 2008 16:46:19 -0400 Subject: driver core: basic infrastructure for per-module dynamic debug messages Base infrastructure to enable per-module debug messages. I've introduced CONFIG_DYNAMIC_PRINTK_DEBUG, which when enabled centralizes control of debugging statements on a per-module basis in one /proc file, currently, /dynamic_printk/modules. When, CONFIG_DYNAMIC_PRINTK_DEBUG, is not set, debugging statements can still be enabled as before, often by defining 'DEBUG' for the proper compilation unit. Thus, this patch set has no affect when CONFIG_DYNAMIC_PRINTK_DEBUG is not set. The infrastructure currently ties into all pr_debug() and dev_dbg() calls. That is, if CONFIG_DYNAMIC_PRINTK_DEBUG is set, all pr_debug() and dev_dbg() calls can be dynamically enabled/disabled on a per-module basis. Future plans include extending this functionality to subsystems, that define their own debug levels and flags. Usage: Dynamic debugging is controlled by the debugfs file, /dynamic_printk/modules. This file contains a list of the modules that can be enabled. The format of the file is as follows: . . . : Name of the module in which the debug call resides : whether the messages are enabled or not For example: snd_hda_intel enabled=0 fixup enabled=1 driver enabled=0 Enable a module: $echo "set enabled=1 " > dynamic_printk/modules Disable a module: $echo "set enabled=0 " > dynamic_printk/modules Enable all modules: $echo "set enabled=1 all" > dynamic_printk/modules Disable all modules: $echo "set enabled=0 all" > dynamic_printk/modules Finally, passing "dynamic_printk" at the command line enables debugging for all modules. This mode can be turned off via the above disable command. [gkh: minor cleanups and tweaks to make the build work quietly] Signed-off-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d5fcd24e5aff..c52700667292 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + unregister_dynamic_debug_module(mod->name); free_module(mod); out: @@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +{ + struct mod_debug *debug_info; + unsigned long pos, end; + unsigned int num_verbose; + + pos = sechdrs[verboseindex].sh_addr; + num_verbose = sechdrs[verboseindex].sh_size / + sizeof(struct mod_debug); + end = pos + (num_verbose * sizeof(struct mod_debug)); + + for (; pos < end; pos += sizeof(struct mod_debug)) { + debug_info = (struct mod_debug *)pos; + register_dynamic_debug_module(debug_info->modname, + debug_info->type, debug_info->logical_modname, + debug_info->flag_names, debug_info->hash, + debug_info->hash2); + } +} +#else +static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, + unsigned int verboseindex) +{ +} +#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -1831,6 +1859,7 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int verboseindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2117,6 +2146,7 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2167,6 +2197,7 @@ static noinline struct module *load_module(void __user *umod, marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif + dynamic_printk_setup(sechdrs, verboseindex); err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; -- cgit v1.2.1 From 9363b9f23c9cc36cc8ef6c05fdf879ee4a96ae92 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 15 Oct 2008 22:01:05 -0700 Subject: memrlimit: cgroup mm owner callback changes to add task info This patch adds an additional field to the mm_owner callbacks. This field is required to get to the mm that changed. Hold mmap_sem in write mode before calling the mm_owner_changed callback [hugh@veritas.com: fix mmap_sem deadlock] Signed-off-by: Balbir Singh Cc: Sudhir Kumar Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: Li Zefan Cc: Pavel Emelianov Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Vivek Goyal Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- kernel/exit.c | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0123d75ec9a..8c6e1c17e6d3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2735,6 +2735,8 @@ void cgroup_fork_callbacks(struct task_struct *child) * Called on every change to mm->owner. mm_init_owner() does not * invoke this routine, since it assigns the mm->owner the first time * and does not change it. + * + * The callbacks are invoked with mmap_sem held in read mode. */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { @@ -2750,7 +2752,7 @@ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp); + ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); } } } diff --git a/kernel/exit.c b/kernel/exit.c index 85a83c831856..0ef4673e351b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -640,24 +640,23 @@ retry: assign_new_owner: BUG_ON(c == p); get_task_struct(c); + read_unlock(&tasklist_lock); + down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ -- cgit v1.2.1 From 1bfcf1304ea79c46efc3724e548b13b4b442b418 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Oct 2008 22:01:21 -0700 Subject: pm: rework disabling of user mode helpers during suspend/hibernation We currently use a PM notifier to disable user mode helpers before suspend and hibernation and to re-enable them during resume. However, this is not an ideal solution, because if any drivers want to upload firmware into memory before suspend, they have to use a PM notifier for this purpose and there is no guarantee that the ordering of PM notifiers will be as expected (ie. the notifier that disables user mode helpers has to be run after the driver's notifier used for uploading the firmware). For this reason, it seems better to move the disabling and enabling of user mode helpers to separate functions that will be called by the PM core as necessary. [akpm@linux-foundation.org: remove unneeded ifdefs] Signed-off-by: Rafael J. Wysocki Cc: Alan Stern Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 65 +++++++++++++++++++++++------------------------------ kernel/power/disk.c | 11 +++++++++ kernel/power/main.c | 7 ++++++ kernel/power/user.c | 10 ++++++++- 4 files changed, 55 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a0befb..ab7dd08472fc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work) } } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user @@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -static int usermodehelper_pm_callback(struct notifier_block *nfb, - unsigned long action, - void *ignored) +/** + * usermodehelper_disable - prevent new helpers from being started + */ +int usermodehelper_disable(void) { long retval; - switch (action) { - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - usermodehelper_disabled = 1; - smp_mb(); - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, + usermodehelper_disabled = 1; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); - if (retval) { - return NOTIFY_OK; - } else { - usermodehelper_disabled = 0; - return NOTIFY_BAD; - } - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - usermodehelper_disabled = 0; - return NOTIFY_OK; - } + if (retval) + return 0; - return NOTIFY_DONE; + usermodehelper_disabled = 0; + return -EAGAIN; +} + +/** + * usermodehelper_enable - allow new helpers to be started again + */ +void usermodehelper_enable(void) +{ + usermodehelper_disabled = 0; } static void helper_lock(void) @@ -334,18 +332,12 @@ static void helper_unlock(void) if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } - -static void register_pm_notifier_callback(void) -{ - pm_notifier(usermodehelper_pm_callback, 0); -} -#else /* CONFIG_PM */ +#else /* CONFIG_PM_SLEEP */ #define usermodehelper_disabled 0 static inline void helper_lock(void) {} static inline void helper_unlock(void) {} -static inline void register_pm_notifier_callback(void) {} -#endif /* CONFIG_PM */ +#endif /* CONFIG_PM_SLEEP */ /** * call_usermodehelper_setup - prepare to call a usermode helper @@ -515,5 +507,4 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - register_pm_notifier_callback(); } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index bbd85c60f741..331f9836383f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -520,6 +521,10 @@ int hibernate(void) if (error) goto Exit; + error = usermodehelper_disable(); + if (error) + goto Exit; + /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) @@ -558,6 +563,7 @@ int hibernate(void) thaw_processes(); Finish: free_basic_memory_bitmaps(); + usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -634,6 +640,10 @@ static int software_resume(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + error = create_basic_memory_bitmaps(); if (error) goto Finish; @@ -656,6 +666,7 @@ static int software_resume(void) thaw_processes(); Done: free_basic_memory_bitmaps(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 540b16b68565..19122cf6d827 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -237,6 +238,10 @@ static int suspend_prepare(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + if (suspend_freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -256,6 +261,7 @@ static int suspend_prepare(void) Thaw: suspend_thaw_processes(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); @@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + usermodehelper_enable(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } diff --git a/kernel/power/user.c b/kernel/power/user.c index a6332a313262..005b93d839ba 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREEZE: if (data->frozen) break; + printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); - error = freeze_processes(); + error = usermodehelper_disable(); if (error) + break; + + error = freeze_processes(); + if (error) { thaw_processes(); + usermodehelper_enable(); + } if (!error) data->frozen = 1; break; @@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!data->frozen || data->ready) break; thaw_processes(); + usermodehelper_enable(); data->frozen = 0; break; -- cgit v1.2.1 From d9f3216b474be170d0c093d70125b541ace58704 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 15 Oct 2008 22:01:26 -0700 Subject: kernel/dma.c: remove a CVS keyword Remove a CVS keyword that wasn't updated for a long time from a comment. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma.c b/kernel/dma.c index d2c60a822790..f903189c5304 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -1,4 +1,4 @@ -/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ +/* * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. * * Written by Hennus Bergman, 1992. -- cgit v1.2.1 From a25d644fc0e232f242d1f3baa63c149c42536ff0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Oct 2008 22:01:38 -0700 Subject: wait: kill is_sync_wait() is_sync_wait() is used to distinguish between sync and async waits. Basically sync waits are the ones initialized with init_waitqueue_entry() and async ones with init_waitqueue_func_entry(). The sync/async distinction is used only in prepare_to_wait[_exclusive]() and its only function is to skip setting the current task state if the wait is async. This has a few problems. * No one uses it. None of func_entry users use prepare_to_wait() functions, so the code path never gets executed. * The distinction is bogus. Maybe back when func_entry is used only by aio but it's now also used by epoll and in future possibly by 9p and poll/select. * Taking @state as argument and ignoring it silenly depending on how @wait is initialized is just a bad error-prone API. * It prevents func_entry waits from using wait->private for no good reason. This patch kills is_sync_wait() and the associated code paths from prepare_to_wait[_exclusive](). As there was no user of these code paths, this patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/wait.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index c275c56cf2d3..cd87131f2fc2 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); @@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue_tail(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -- cgit v1.2.1 From 9ba16087d9f996a93ab6f4453a52a4b24bc1f25c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 15 Oct 2008 22:01:38 -0700 Subject: Kconfig: eliminate "def_bool n" constructs Using "def_bool n" is pointless, simply using bool here appears more appropriate. Further, retaining such options that don't have a prompt and aren't selected by anything seems also at least questionable. Signed-off-by: Jan Beulich Cc: Ingo Molnar Cc: Tony Luck Cc: Thomas Gleixner Cc: Bartlomiej Zolnierkiewicz Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8d53106a0a92..95ed42951e0a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -3,7 +3,6 @@ # config TICK_ONESHOT bool - default n config NO_HZ bool "Tickless System (Dynamic Ticks)" -- cgit v1.2.1 From 25ddbb18aae33ad255eb9f35aacebe3af01e1e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:01:41 -0700 Subject: Make the taint flags reliable It's somewhat unlikely that it happens, but right now a race window between interrupts or machine checks or oopses could corrupt the tainted bitmap because it is modified in a non atomic fashion. Convert the taint variable to an unsigned long and use only atomic bit operations on it. Unfortunately this means the intvec sysctl functions cannot be used on it anymore. It turned out the taint sysctl handler could actually be simplified a bit (since it only increases capabilities) so this patch actually removes code. [akpm@linux-foundation.org: remove unneeded include] Signed-off-by: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 12 +++++----- kernel/panic.c | 63 +++++++++++++++++++++++++++++++++++-------------- kernel/softlockup.c | 2 +- kernel/sysctl.c | 67 ++++++++++++++++++++++------------------------------- 4 files changed, 81 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04b..dd9ac6ad5cb9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod) static inline void add_taint_module(struct module *mod, unsigned flag) { add_taint(flag); - mod->taints |= flag; + mod->taints |= (1U << flag); } /* @@ -923,7 +923,7 @@ static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *symname) { #ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) + if (!test_taint(TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint_module(mod, TAINT_FORCED_MODULE); @@ -1033,7 +1033,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, const unsigned long *crc; ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); + !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -1634,7 +1634,7 @@ static void set_license(struct module *mod, const char *license) license = "unspecified"; if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) + if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -2552,9 +2552,9 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) + if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; /* * TAINT_FORCED_RMMOD: could be added. diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a6c89b..028013f7afd4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,7 +23,7 @@ #include int panic_on_oops; -int tainted; +static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -159,31 +159,60 @@ EXPORT_SYMBOL(panic); * The string is overwritten by the next call to print_taint(). */ +struct tnt { + u8 bit; + char true; + char false; +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, +}; + const char *print_tainted(void) { - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; + } else snprintf(buf, sizeof(buf), "Not tainted"); return(buf); } +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + void add_taint(unsigned flag) { debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee93a82..3953e4aed733 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if ((tainted & TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cfc5295f1e82..ec88fcc9a0d2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -149,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -379,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -2228,49 +2227,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { -- cgit v1.2.1 From 0c2d64fb6cae9aae480f6a46cfe79f8d7d48b59f Mon Sep 17 00:00:00 2001 From: Adam Tkac Date: Wed, 15 Oct 2008 22:01:45 -0700 Subject: rlimit: permit setting RLIMIT_NOFILE to RLIM_INFINITY When a process wants to set the limit of open files to RLIM_INFINITY it gets EPERM even if it has CAP_SYS_RESOURCE capability. For example, BIND does: ... #elif defined(NR_OPEN) && defined(__linux__) /* * Some Linux kernels don't accept RLIM_INFINIT; the maximum * possible value is the NR_OPEN defined in linux/fs.h. */ if (resource == isc_resource_openfiles && rlim_value == RLIM_INFINITY) { rl.rlim_cur = rl.rlim_max = NR_OPEN; unixresult = setrlimit(unixresource, &rl); if (unixresult == 0) return (ISC_R_SUCCESS); } #elif ... If we allow setting RLIMIT_NOFILE to RLIM_INFINITY we increase portability - you don't have to check if OS is linux and then use different schema for limits. The spec says "Specifying RLIM_INFINITY as any resource limit value on a successful call to setrlimit() shall inhibit enforcement of that resource limit." and we're presently not doing that. Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 234d9454294e..d5b79f65ad9b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1450,14 +1450,22 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; + + if (resource == RLIMIT_NOFILE) { + if (new_rlim.rlim_max == RLIM_INFINITY) + new_rlim.rlim_max = sysctl_nr_open; + if (new_rlim.rlim_cur == RLIM_INFINITY) + new_rlim.rlim_cur = sysctl_nr_open; + if (new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; + } + + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; retval = security_task_setrlimit(resource, &new_rlim); if (retval) -- cgit v1.2.1 From 22b8ce94708f7cdf0b04965c6f7443dfd374c35c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 15 Oct 2008 22:01:46 -0700 Subject: profiling: dynamically enable readprofile at runtime Way too often, I have a machine that exhibits some kind of crappy behavior. The CPU looks wedged in the kernel or it is spending way too much system time and I wonder what is responsible. I try to run readprofile. But, of course, Ubuntu doesn't enable it by default. Dang! The reason we boot-time enable it is that it takes a big bufffer that we generally can only bootmem alloc. But, does it hurt to at least try and runtime-alloc it? To use: echo 2 > /sys/kernel/profile Then run readprofile like normal. This should fix the compile issue with allmodconfig. I've compile-tested on a bunch more configs now including a few more architectures. Signed-off-by: Dave Hansen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 35 +++++++++++++++++++++++++++++++++++ kernel/profile.c | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e53bc30e9ba5..08dd8ed86c77 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #define KERNEL_ATTR_RO(_name) \ @@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj, KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_PROFILING +static ssize_t profiling_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", prof_on); +} +static ssize_t profiling_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + if (prof_on) + return -EEXIST; + /* + * This eventually calls into get_option() which + * has a ton of callers and is not const. It is + * easiest to cast it away here. + */ + profile_setup((char *)buf); + ret = profile_init(); + if (ret) + return ret; + ret = create_proc_profile(); + if (ret) + return ret; + return count; +} +KERNEL_ATTR_RW(profiling); +#endif + #ifdef CONFIG_KEXEC static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_PROFILING + &profiling_attr.attr, +#endif #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, diff --git a/kernel/profile.c b/kernel/profile.c index cd26bed4cc26..a9e422df6bf6 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ -static int __init profile_setup(char *str) +int profile_setup(char *str) { - static char __initdata schedstr[] = "schedule"; - static char __initdata sleepstr[] = "sleep"; - static char __initdata kvmstr[] = "kvm"; + static char schedstr[] = "schedule"; + static char sleepstr[] = "sleep"; + static char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -100,14 +102,33 @@ static int __init profile_setup(char *str) __setup("profile=", profile_setup); -void __init profile_init(void) +int profile_init(void) { + int buffer_bytes; if (!prof_on) - return; + return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; - prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); + buffer_bytes = prof_len*sizeof(atomic_t); + if (!slab_is_available()) { + prof_buffer = alloc_bootmem(buffer_bytes); + return 0; + } + + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + if (prof_buffer) + return 0; + + prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + if (prof_buffer) + return 0; + + prof_buffer = vmalloc(buffer_bytes); + if (prof_buffer) + return 0; + + return -ENOMEM; } /* Profile event notifications */ @@ -527,7 +548,7 @@ static void __init profile_nop(void *unused) { } -static int __init create_hash_tables(void) +static int create_hash_tables(void) { int cpu; @@ -575,14 +596,14 @@ out_cleanup: #define create_hash_tables() ({ 0; }) #endif -static int __init create_proc_profile(void) +int create_proc_profile(void) { struct proc_dir_entry *entry; if (!prof_on) return 0; if (create_hash_tables()) - return -1; + return -ENOMEM; entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) -- cgit v1.2.1 From 87988815073918134c0dae059cf247a4472d78ed Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 15 Oct 2008 22:01:51 -0700 Subject: utsname: completely overwrite prior information On sethostname() and setdomainname(), previous information may be retained if it was longer than than the new hostname/domainname. This can be demonstrated trivially by calling sethostname() first with a long name, then with a short name, and then calling uname() to retrieve the full buffer that contains the hostname (and possibly parts of the old hostname), one just has to look past the terminating zero. I don't know if we should really care that much (hence the RFC); the only scenarios I can possibly think of is administrator putting something sensitive in the hostname (or domain name) by accident, and changing it back will not undo the mistake entirely, though it's not like we can recover gracefully from "rm -rf /" either... The other scenario is namespaces (CLONE_NEWUTS) where some information may be unintentionally "inherited" from the previous namespace (a program wants to hide the original name and does clone + sethostname, but some information is still left). I think the patch may be defended on grounds of the principle of least surprise. But I am not adamant :-) (I guess the question now is whether userspace should be able to write embedded NULs into the buffer or not...) At least the observation has been made and the patch has been presented. Signed-off-by: Vegard Nossum Cc: "Eric W. Biederman" Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d5b79f65ad9b..558b035965aa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1350,7 +1350,8 @@ asmlinkage long sys_sethostname(char __user *name, int len) errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { memcpy(utsname()->nodename, tmp, len); - utsname()->nodename[len] = 0; + memset(utsname()->nodename + len, 0, + sizeof(utsname()->nodename) - len); errno = 0; } up_write(&uts_sem); @@ -1396,7 +1397,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len) errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { memcpy(utsname()->domainname, tmp, len); - utsname()->domainname[len] = 0; + memset(utsname()->domainname + len, 0, + sizeof(utsname()->domainname) - len); errno = 0; } up_write(&uts_sem); -- cgit v1.2.1 From 9679e4dd628743b9ef4375d60ae69923c3766173 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 15 Oct 2008 22:01:51 -0700 Subject: kernel/sys.c: improve code generation utsname() is quite expensive to calculate. Cache it in a local. text data bss dec hex filename before: 11136 720 16 11872 2e60 kernel/sys.o after: 11096 720 16 11832 2e38 kernel/sys.o Acked-by: Vegard Nossum Cc: "Eric W. Biederman" Acked-by: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 558b035965aa..0bc8fa3c2288 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1349,9 +1349,10 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->nodename, tmp, len); - memset(utsname()->nodename + len, 0, - sizeof(utsname()->nodename) - len); + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } up_write(&uts_sem); @@ -1363,15 +1364,17 @@ asmlinkage long sys_sethostname(char __user *name, int len) asmlinkage long sys_gethostname(char __user *name, int len) { int i, errno; + struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(utsname()->nodename); + u = utsname(); + i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, utsname()->nodename, i)) + if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1396,9 +1399,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->domainname, tmp, len); - memset(utsname()->domainname + len, 0, - sizeof(utsname()->domainname) - len); + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } up_write(&uts_sem); -- cgit v1.2.1 From 7968b3d9a673e922ab980b2616945e63a52d88fe Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 15 Oct 2008 22:01:57 -0700 Subject: kernel/kallsyms.c: fix double return Commit 6dd06c9fbe025f542bce4cdb91790c0f91962722 ("module: make module_address_lookup safe") introduced double returns in the function kallsyms_lookup(), it's weird. The second one should be removed. Signed-off-by: WANG Cong Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 38fc10ac7541..5072cf1685a2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr, /* see if it's in a module */ return module_address_lookup(addr, symbolsize, offset, modname, namebuf); - return NULL; } int lookup_symbol_name(unsigned long addr, char *symname) -- cgit v1.2.1 From e1f8e87449147ffe5ea3de64a46af7de450ce279 Mon Sep 17 00:00:00 2001 From: Francois Cami Date: Wed, 15 Oct 2008 22:01:59 -0700 Subject: Remove Andrew Morton's old email accounts People can use the real name an an index into MAINTAINERS to find the current email address. Signed-off-by: Francois Cami Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- kernel/workqueue.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a430fd04008b..c3ab51dc9faf 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -13,7 +13,7 @@ * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton + * 01Mar01 Andrew Morton */ #include diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4048e92aa04f..714afad46539 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -9,7 +9,7 @@ * Derived from the taskqueue/keventd code by: * * David Woodhouse - * Andrew Morton + * Andrew Morton * Kai Petzke * Theodore Ts'o * -- cgit v1.2.1 From 6d5cd6effed5f8c958a6c0df56da336f5a4fdb8a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Oct 2008 22:02:00 -0700 Subject: taint: fix kernel-doc Move print_tainted() kernel-doc to avoid the following error: Error(/var/linsrc/mmotm-2008-1002-1617//kernel/panic.c:155): cannot understand prototype: 'struct tnt ' Signed-off-by: Randy Dunlap Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 028013f7afd4..f290e8e866f6 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -143,21 +143,6 @@ NORET_TYPE void panic(const char * fmt, ...) EXPORT_SYMBOL(panic); -/** - * print_tainted - return a string to represent the kernel taint state. - * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * - * The string is overwritten by the next call to print_taint(). - */ struct tnt { u8 bit; @@ -178,6 +163,21 @@ static const struct tnt tnts[] = { { TAINT_WARN, 'W', ' ' }, }; +/** + * print_tainted - return a string to represent the kernel taint state. + * + * 'P' - Proprietary module has been loaded. + * 'F' - Module has been forcibly loaded. + * 'S' - SMP with CPUs not designed for SMP. + * 'R' - User forced a module unload. + * 'M' - System experienced a machine check exception. + * 'B' - System has hit bad_page. + * 'U' - Userspace-defined naughtiness. + * 'A' - ACPI table overridden. + * 'W' - Taint on warning. + * + * The string is overwritten by the next call to print_taint(). + */ const char *print_tainted(void) { static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; -- cgit v1.2.1 From 20036fdcaf05fac0a84ed81a56906493a7d822e2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:02:02 -0700 Subject: Add kerneldoc documentation for new printk format extensions Add documentation in kerneldoc for new printk format extensions This patch documents the new %pS/%pF options in printk in kernel doc. Hope I didn't miss any other extension. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index c3ab51dc9faf..fbd94bdaa74f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -593,6 +593,8 @@ static int have_callable_console(void) * * See also: * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. */ asmlinkage int printk(const char *fmt, ...) -- cgit v1.2.1 From b418da16dd44810e5d5a22bba377cca80512a524 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Oct 2008 22:02:06 -0700 Subject: compat: generic compat get/settimeofday Nothing arch specific in get/settimeofday. The details of the timeval conversion varied a little from arch to arch, but all with the same results. Also add an extern declaration for sys_tz to linux/time.h because externs in .c files are fowned upon. I'll kill the externs in various other files in a sparate patch. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig Acked-by: David S. Miller [ sparc bits ] Cc: "Luck, Tony" Cc: Ralf Baechle Acked-by: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a8ab9a..143990e48cb9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,6 +26,64 @@ #include +/* + * Note that the native side is already converted to a timespec, because + * that's what we want anyway. + */ +static int compat_get_timeval(struct timespec *o, + struct compat_timeval __user *i) +{ + long usec; + + if (get_user(o->tv_sec, &i->tv_sec) || + get_user(usec, &i->tv_usec)) + return -EFAULT; + o->tv_nsec = usec * 1000; + return 0; +} + +static int compat_put_timeval(struct compat_timeval __user *o, + struct timeval *i) +{ + return (put_user(i->tv_sec, &o->tv_sec) || + put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (compat_put_timeval(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (compat_get_timeval(&kts, tv)) + return -EFAULT; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || -- cgit v1.2.1 From f221e726bf4e082a05dcd573379ac859bfba7126 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Oct 2008 22:04:23 -0700 Subject: sysctl: simplify ->strategy name and nlen parameters passed to ->strategy hook are unused, remove them. In general ->strategy hook should know what it's doing, and don't do something tricky for which, say, pointer to original userspace array may be needed (name). Signed-off-by: Alexey Dobriyan Acked-by: David S. Miller [ networking bits ] Cc: Ralf Baechle Cc: David Howells Cc: Matt Mackall Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 +++++++++++++---------------- kernel/utsname_sysctl.c | 5 ++--- 2 files changed, 15 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ec88fcc9a0d2..9792c315676a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1500,7 +1500,6 @@ void register_sysctl_root(struct ctl_table_root *root) /* Perform the actual read/write of a sysctl table entry. */ static int do_sysctl_strategy(struct ctl_table_root *root, struct ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1514,8 +1513,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, return -EPERM; if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = table->strategy(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1525,8 +1523,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, /* If there is no strategy routine, or if the strategy returns * zero, proceed with automatic r/w */ if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = sysctl_data(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; } @@ -1558,7 +1555,7 @@ repeat: table = table->child; goto repeat; } - error = do_sysctl_strategy(root, table, name, nlen, + error = do_sysctl_strategy(root, table, oldval, oldlenp, newval, newlen); return error; @@ -2707,7 +2704,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, */ /* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2741,7 +2738,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen, } /* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2787,7 +2784,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen, * are between the minimum and maximum values given in the arrays * table->extra1 and table->extra2, respectively. */ -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2823,7 +2820,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2857,7 +2854,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2912,35 +2909,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) return error; } -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ab9659d269e..3b34b3545936 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_SYSCTL_SYSCALL /* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, +static int sysctl_uts_string(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, write = newval && newlen; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); + r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); put_uts(table, write, uts_table.data); return r; } -- cgit v1.2.1 From ebf3f09c634906d371f2bfd71b41c7e0c52efe7e Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 15 Oct 2008 22:05:12 -0700 Subject: Configure out AIO support This patchs adds the CONFIG_AIO option which allows to remove support for asynchronous I/O operations, that are not necessarly used by applications, particularly on embedded devices. As this is a size-reduction option, it depends on CONFIG_EMBEDDED. It allows to save ~7 kilobytes of kernel code/data: text data bss dec hex filename 1115067 119180 217088 1451335 162547 vmlinux 1108025 119048 217088 1444161 160941 vmlinux.new -7042 -132 0 -7174 -1C06 +/- This patch has been originally written by Matt Mackall , and is part of the Linux Tiny project. [randy.dunlap@oracle.com: build fix] Signed-off-by: Thomas Petazzoni Cc: Benjamin LaHaise Cc: Zach Brown Signed-off-by: Matt Mackall Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 5 +++++ kernel/sysctl.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 503d8d4eb80a..a77b27b11b04 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,11 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); +cond_syscall(sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9792c315676a..617d41e4d6a0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1281,6 +1281,7 @@ static struct ctl_table fs_table[] = { .extra2 = &two, }, #endif +#ifdef CONFIG_AIO { .procname = "aio-nr", .data = &aio_nr, @@ -1295,6 +1296,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, -- cgit v1.2.1 From c26ec88ea86ad5122a6ea5ad635e6a1f6c395d74 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 15 Oct 2008 22:05:14 -0700 Subject: resources: tidy __request_region() No functional change. Just return NULL for kzalloc failure immediately, rather than wrapping the whole function body in the body of an "if". Signed-off-by: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 414d6fc9131e..f193d6e3ded2 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -630,33 +630,34 @@ struct resource * __request_region(struct resource *parent, { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - if (res) { - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; + if (!res) + return NULL; - write_lock(&resource_lock); + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; - for (;;) { - struct resource *conflict; + write_lock(&resource_lock); - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } + for (;;) { + struct resource *conflict; - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; + conflict = __request_resource(parent, res); + if (!conflict) break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; } - write_unlock(&resource_lock); + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; } + write_unlock(&resource_lock); return res; } EXPORT_SYMBOL(__request_region); -- cgit v1.2.1 From 2b252c541158cfed9d7e5b019b894fe2174f5908 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 15 Oct 2008 22:05:20 -0700 Subject: make kprobes.c:kretprobe_table_lock() static Make the needlessly global kretprobe_table_lock() static. Signed-off-by: Adrian Bunk Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 75bc2cd9ebc6..8b57a2597f21 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); -- cgit v1.2.1 From 1c95e1b69073cff5ff179e592fa1a1e182c78a17 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Oct 2008 15:32:46 -0700 Subject: Fix kernel/softirq.c printk format warning properly This fixes the broken 77af7e3403e7314c47b0c07fbc5e4ef21d939532 ("softirq, warning fix: correct a format to avoid a warning") fix correctly. The type of a pointer subtraction is not "int", nor is it "long". It can be either (or something else). It's "ptrdiff_t", and the printk format for it is "%td". Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index be7a8292f992..37d67aa2d56f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,7 +210,7 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %d %p" + printk(KERN_ERR "huh, entered softirq %td %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, h->action, prev_count, preempt_count()); -- cgit v1.2.1 From 54514a70adefe356afe854e2d3912d46668068e6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Sep 2008 22:15:57 -0700 Subject: softirq: Add support for triggering softirq work on softirqs. This is basically a genericization of Jens Axboe's block layer remote softirq changes. Signed-off-by: David S. Miller Signed-off-by: Jens Axboe --- kernel/softirq.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa2d56f..83ba21a13bd4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include @@ -474,17 +476,144 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -- cgit v1.2.1 From c851c8676bd7ae456e9b3af8e6bb2c434eddcc75 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 17 Oct 2008 18:17:46 +0800 Subject: sched: fix the wrong mask_len If NR_CPUS isn't a multiple of 32, we get a truncated string of sched domains by catting /proc/schedstat. This is caused by the wrong mask_len. This patch fixes it. Signed-off-by: Miao Xie Cc: Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43987e2..81365b3d89f8 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,7 +9,7 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = NR_CPUS/32 * 9; + int mask_len = (NR_CPUS/32 + 1) * 9; char *mask_str = kmalloc(mask_len, GFP_KERNEL); if (mask_str == NULL) -- cgit v1.2.1 From b968905292eaa52b25abb7b3e6c0841dac9f03ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 12:58:19 +0200 Subject: sched: fix the wrong mask_len, cleanup Clean up the division in show_schedstat(). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 81365b3d89f8..67579253b53b 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,7 +9,7 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = (NR_CPUS/32 + 1) * 9; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; char *mask_str = kmalloc(mask_len, GFP_KERNEL); if (mask_str == NULL) -- cgit v1.2.1 From b0aa51b999c449e5e3f9faa1ee406e052d407fe7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 17 Oct 2008 15:33:21 +0200 Subject: sched: minor fast-path overhead reduction Greetings, 103638d added a bit of avoidable overhead to the fast-path. Use sysctl_sched_min_granularity instead of sched_slice() to restrict buddy wakeups. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 18fd17172eb6..67084936b602 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -747,7 +747,7 @@ pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rq *rq = rq_of(cfs_rq); u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) { cfs_rq->pair_start = rq->clock; return se; } -- cgit v1.2.1 From 719254faa17ffedc87ba0fadb9b34e535c9758d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 09:59:47 +0200 Subject: NOHZ: unify the nohz function calls in irq_enter() We have two separate nohz function calls in irq_enter() for no good reason. Just call a single NOHZ function from irq_enter() and call the bits in the tick code. Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 10 +++------- kernel/time/tick-sched.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa2d56f..d410014279e7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -265,16 +265,12 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { -#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif + tick_check_idle(cpu); + __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b711ffcb106c..fdcf3f93bb8d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -558,6 +558,17 @@ static inline void tick_nohz_switch_to_nohz(void) { } #endif /* NO_HZ */ +/* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ +#ifdef CONFIG_NO_HZ + tick_nohz_stop_idle(cpu); + tick_nohz_update_jiffies(); +#endif +} + /* * High resolution timer specific code */ -- cgit v1.2.1 From c34bec5a44e9486597d78e7a686b2f9088a0564c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 10:04:34 +0200 Subject: NOHZ: split tick_nohz_restart_sched_tick() Split out the clock event device reprogramming. Preparatory patch. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 49 ++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fdcf3f93bb8d..7aedf4343539 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -377,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } +} + /** * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * @@ -430,28 +456,7 @@ void tick_nohz_restart_sched_tick(void) */ ts->tick_stopped = 0; ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } + tick_nohz_restart(ts, now); local_irq_enable(); } -- cgit v1.2.1 From fb02fbc14d17837b4b7b02dbb36142c16a7bf208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 10:01:23 +0200 Subject: NOHZ: restart tick device from irq_enter() We did not restart the tick device from irq_enter() to avoid double reprogramming and extra events in the return immediate to idle case. But long lasting softirqs can lead to a situation where jiffies become stale: idle() tick stopped (reprogrammed to next pending timer) halt() interrupt jiffies updated from irq_enter() interrupt handler softirq function 1 runs 20ms softirq function 2 arms a 10ms timer with a stale jiffies value jiffies updated from irq_exit() timer wheel has now an already expired timer (the one added in function 2) timer fires and timer softirq runs This was discovered when debugging a timer problem which happend only when the ath5k driver is active. The debugging proved that there is a softirq function running for more than 20ms, which is a bug by itself. To solve this we restart the tick timer right from irq_enter(), but do not go through the other functions which are necessary to return from idle when need_resched() is set. Reported-by: Elias Oltmanns Signed-off-by: Thomas Gleixner Tested-by: Elias Oltmanns --- kernel/time/tick-broadcast.c | 13 +++++++++++++ kernel/time/tick-internal.h | 2 ++ kernel/time/tick-sched.c | 31 +++++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index cb01cd8f919b..f98a1b7b16e9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -383,6 +383,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) return 0; } +/* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + /* * Handle oneshot mode broadcasting */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 469248782c23..b1c05bf75ee0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } # endif /* !BROADCAST */ #else /* !ONESHOT */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7aedf4343539..0581c11fe6c6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -508,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -557,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void) smp_processor_id()); } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!ts->tick_stopped) + return; + + tick_nohz_restart(ts, ktime_get()); +} + #else static inline void tick_nohz_switch_to_nohz(void) { } @@ -568,9 +585,11 @@ static inline void tick_nohz_switch_to_nohz(void) { } */ void tick_check_idle(int cpu) { + tick_check_oneshot_broadcast(cpu); #ifdef CONFIG_NO_HZ tick_nohz_stop_idle(cpu); tick_nohz_update_jiffies(); + tick_nohz_kick_tick(cpu); #endif } @@ -627,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) profile_tick(CPU_PROFILING); } - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; -- cgit v1.2.1 From e67ef25a35b949561a9bd77693523ec94ab4a278 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Sep 2008 23:50:23 +0200 Subject: timer_list: print real timer address The current timer_list output prints the address of the on stack copy of the active hrtimer instead of the hrtimer itself. Print the address of the real timer instead. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20fd0001..ec9ea6cadd85 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym) } static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) { #ifdef CONFIG_TIMER_STATS char tmp[TASK_COMM_LEN + 1]; #endif SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); + print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02lx", timer->state); @@ -99,7 +100,7 @@ next_one: tmp = *timer; spin_unlock_irqrestore(&base->cpu_base->lock, flags); - print_timer(m, &tmp, i, now); + print_timer(m, timer, &tmp, i, now); next++; goto next_one; } -- cgit v1.2.1 From c5b77a3d3a716a5c61a1999d7f2a78e9c39fd1b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 17:31:41 +0200 Subject: timer_list: print cpu number of clockevents device The per cpu clock events device output of timer_list lacks an association of the device to the cpu which is annoying when looking at the output of /proc/timer_list from a 128 way system. Add the CPU number info and mark the broadcast device in the device list printout. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ec9ea6cadd85..5479c6e7a023 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -184,12 +184,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_GENERIC_CLOCKEVENTS static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { @@ -223,7 +227,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) int cpu; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); + print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", tick_get_broadcast_mask()->bits[0]); #ifdef CONFIG_TICK_ONESHOT @@ -233,7 +237,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) SEQ_printf(m, "\n"); #endif for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); + print_tickdevice(m, tick_get_device(cpu), cpu); SEQ_printf(m, "\n"); } #else -- cgit v1.2.1 From 870e2a284567714335d125c390366dce882d726f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 17:41:55 +0200 Subject: timer_list: add base address to clock base The base address of a (per cpu) clock base is a useful debug info. Add it and bump the version number of timer_lists. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5479c6e7a023..f6426911e35a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -110,6 +110,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -249,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "Timer List Version: v0.4\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.1 From e1dd7bc58578ebfcaba989608017fe5156c29c86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2008 13:33:36 +0200 Subject: hrtimers: fix docbook comments hrtimer_start() and hrtimer_start_range_ns() handle relative and absolute timers. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 51ee90bca2de..00e6f0a1e7a3 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -946,7 +946,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) } /** - * hrtimer_start_range_ns - (re)start an relative timer on the current CPU + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer @@ -1022,7 +1022,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** - * hrtimer_start - (re)start an relative timer on the current CPU + * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) -- cgit v1.2.1 From 643bdf68f92a8516574ed7ca3713f9334c331b8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2008 13:38:11 +0200 Subject: hrtimers: simplify hrtimer_peek_ahead_timers() Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 00e6f0a1e7a3..4fc41414fc06 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1394,22 +1394,16 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ void hrtimer_peek_ahead_timers(void) { - unsigned long flags; struct tick_device *td; - struct clock_event_device *dev; + unsigned long flags; if (!hrtimer_hres_active()) return; local_irq_save(flags); td = &__get_cpu_var(tick_cpu_device); - if (!td) - goto out; - dev = td->evtdev; - if (!dev) - goto out; - hrtimer_interrupt(dev); -out: + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); local_irq_restore(flags); } -- cgit v1.2.1 From ffda12a17a324103e9900fa1035309811eecbfe5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 19:27:02 +0200 Subject: sched: optimize group load balancer I noticed that tg_shares_up() unconditionally takes rq-locks for all cpus in the sched_domain. This hurts. We need the rq-locks whenever we change the weight of the per-cpu group sched entities. To allevate this a little, only change the weight when the new weight is at least shares_thresh away from the old value. This avoids the rq-lock for the top level entries, since those will never be re-weighted, and fuzzes the lower level entries a little to gain performance in semi-stable situations. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 45 +++++++++++++++++++++++++-------------------- kernel/sysctl.c | 10 ++++++++++ 2 files changed, 35 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c530b84c7f80..11ca39017835 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -817,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; */ unsigned int sysctl_sched_shares_ratelimit = 250000; +/* + * Inject some fuzzyness into changing the per-cpu group shares + * this avoids remote rq-locks at the expense of fairness. + * default: 4 + */ +unsigned int sysctl_sched_shares_thresh = 4; + /* * period over which we measure -rt task cpu usage in us. * default: 1s @@ -1453,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) +update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; @@ -1485,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * */ shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + if (abs(shares - tg->se[cpu]->load.weight) > + sysctl_sched_shares_thresh) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; + spin_lock_irqsave(&rq->lock, flags); + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; - __set_se_shares(tg->se[cpu], shares); + __set_se_shares(tg->se[cpu], shares); + spin_unlock_irqrestore(&rq->lock, flags); + } } /* @@ -1526,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!rq_weight) rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) { - struct rq *rq = cpu_rq(i); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, i, shares, rq_weight); - spin_unlock_irqrestore(&rq->lock, flags); - } + for_each_cpu_mask(i, sd->span) + update_group_shares_cpu(tg, i, shares, rq_weight); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a0..3d804f41e649 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -274,6 +274,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_thresh", + .data = &sysctl_sched_shares_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", -- cgit v1.2.1 From a4c2f00f5cb848af7a8c816426b413c8e41834df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 19:27:03 +0200 Subject: sched: fair scheduler should not resched rt tasks With use of ftrace Steven noticed that some RT tasks got rescheduled due to sched_fair interaction. What happens is that we reprogram the hrtick from enqueue/dequeue_fair_task() because that can change nr_running, and thus a current tasks ideal runtime. However, its possible the current task isn't a fair_sched_class task, and thus doesn't have a hrtick set to change. Fix this by wrapping those hrtick_start_fair() calls in a hrtick_update() function, which will check for the right conditions. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 67084936b602..0c4bcac54761 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +static const struct sched_class fair_sched_class; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -848,11 +850,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta); } } + +/* + * called from enqueue/dequeue and updates the hrtick when the + * current task is from our class and nr_running is low enough + * to matter. + */ +static void hrtick_update(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + + if (curr->sched_class != &fair_sched_class) + return; + + if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) + hrtick_start_fair(rq, curr); +} #else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { } + +static inline void hrtick_update(struct rq *rq) +{ +} #endif /* @@ -873,7 +895,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) wakeup = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -895,7 +917,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) sleep = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -1001,8 +1023,6 @@ static inline int wake_idle(int cpu, struct task_struct *p) #ifdef CONFIG_SMP -static const struct sched_class fair_sched_class; - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group -- cgit v1.2.1 From f9c0b0950d5fd8c8c5af39bc061f27ea8fddcac3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 19:27:04 +0200 Subject: sched: revert back to per-rq vruntime Vatsa rightly points out that having the runqueue weight in the vruntime calculations can cause unfairness in the face of task joins/leaves. Suppose: dv = dt * rw / w Then take 10 tasks t_n, each of similar weight. If the first will run 1 then its vruntime will increase by 10. Now, if the next 8 tasks leave after having run their 1, then the last task will get a vruntime increase of 2 after having run 1. Which will leave us with 2 tasks of equal weight and equal runtime, of which one will not be scheduled for 8/2=4 units of time. Ergo, we cannot do that and must use: dv = dt / w. This means we cannot have a global vruntime based on effective priority, but must instead go back to the vruntime per rq model we started out with. This patch was lightly tested by doing starting while loops on each nice level and observing their execution time, and a simple group scenario of 1:2:3 pinned to a single cpu. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c4bcac54761..a0aa38b10fdd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -336,7 +336,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= w / rw + * delta *= P[w / rw] */ static inline unsigned long calc_delta_weight(unsigned long delta, struct sched_entity *se) @@ -350,15 +350,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se) } /* - * delta *= rw / w + * delta /= w */ static inline unsigned long calc_delta_fair(unsigned long delta, struct sched_entity *se) { - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, &se->load); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); return delta; } @@ -388,26 +386,26 @@ static u64 __sched_period(unsigned long nr_running) * We calculate the wall-time slice from the period by taking a part * proportional to the weight. * - * s = p*w/rw + * s = p*P[w/rw] */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); + unsigned long nr_running = cfs_rq->nr_running; + + if (unlikely(!se->on_rq)) + nr_running++; + + return calc_delta_weight(__sched_period(nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s*rw/w = p + * vs = s/w */ -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; - - if (!se->on_rq) - nr_running++; - - return __sched_period(nr_running); + return calc_delta_fair(sched_slice(cfs_rq, se), se); } /* @@ -629,7 +627,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) * stays open at the end. */ if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice_add(cfs_rq, se); + vruntime += sched_vslice(cfs_rq, se); if (!initial) { /* sleeps upto a single latency don't count. */ -- cgit v1.2.1 From 0c4b83da58ec2e96ce9c44c211d6eac5f9dae478 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Oct 2008 14:27:43 +0200 Subject: sched: disable the hrtick for now David Miller reported that hrtick update overhead has tripled the wakeup overhead on Sparc64. That is too much - disable the HRTICK feature for now by default, until a faster implementation is found. Reported-by: David Miller Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 7c9e8f4a049f..fda016218296 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -5,7 +5,7 @@ SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) -SCHED_FEAT(HRTICK, 1) +SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) -- cgit v1.2.1 From af936a1606246a10c145feac3770f6287f483f02 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sat, 18 Oct 2008 20:26:53 -0700 Subject: vmscan: unevictable LRU scan sysctl This patch adds a function to scan individual or all zones' unevictable lists and move any pages that have become evictable onto the respective zone's inactive list, where shrink_inactive_list() will deal with them. Adds sysctl to scan all nodes, and per node attributes to individual nodes' zones. Kosaki: If evictable page found in unevictable lru when write /proc/sys/vm/scan_unevictable_pages, print filename and file offset of these pages. [akpm@linux-foundation.org: fix one CONFIG_MMU=n build error] [kosaki.motohiro@jp.fujitsu.com: adapt vmscan-unevictable-lru-scan-sysctl.patch to new sysfs API] Signed-off-by: Lee Schermerhorn Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a0..b3cc73931d1f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -833,6 +833,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = &scan_unevictable_handler, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.1 From 8174f1503f4bf7e9a14b3fbbfdb30c6be6e29f77 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:19 -0700 Subject: container freezer: make refrigerator always available Now that the TIF_FREEZE flag is available in all architectures, extract the refrigerator() and freeze_task() from kernel/power/process.c and make it available to all. The refrigerator() can now be used in a control group subsystem implementing a control group freezer. Signed-off-by: Cedric Le Goater Signed-off-by: Matt Helsley Acked-by: Serge E. Hallyn Tested-by: Matt Helsley Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/freezer.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/power/Kconfig | 3 ++ kernel/power/process.c | 116 ---------------------------------------------- 4 files changed, 126 insertions(+), 116 deletions(-) create mode 100644 kernel/freezer.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df7c3e2..e8194d15d5f4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/freezer.c b/kernel/freezer.c new file mode 100644 index 000000000000..cb0931f89306 --- /dev/null +++ b/kernel/freezer.c @@ -0,0 +1,122 @@ +/* + * kernel/freezer.c - Function to freeze a process + * + * Originally from kernel/power/process.c + */ + +#include +#include +#include +#include +#include + +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ + if (!unlikely(current->flags & PF_NOFREEZE)) { + current->flags |= PF_FROZEN; + wmb(); + } + clear_freeze_flag(current); +} + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(void) +{ + /* Hmm, should we be allowed to suspend when there are realtime + processes around? */ + long save; + + task_lock(current); + if (freezing(current)) { + frozen_process(); + task_unlock(current); + } else { + task_unlock(current); + return; + } + save = current->state; + pr_debug("%s entered refrigerator\n", current->comm); + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; + schedule(); + } + pr_debug("%s left refrigerator\n", current->comm); + __set_current_state(save); +} +EXPORT_SYMBOL(refrigerator); + +static void fake_signal_wake_up(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); +} + +/** + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and + * either sending a fake signal to it or waking it up, depending on whether + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. + */ +bool freeze_task(struct task_struct *p, bool sig_only) +{ + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. + */ + if (!freezing(p)) { + rmb(); + if (frozen(p)) + return false; + + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + if (!signal_pending(p)) + fake_signal_wake_up(p); + } else if (sig_only) { + return false; + } else { + wake_up_state(p, TASK_INTERRUPTIBLE); + } + + return true; +} + +void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + clear_freeze_flag(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_and_wake(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index dcd165f92a88..ebdd7f55273d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -85,6 +85,9 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y +config FREEZER + def_bool PM_SLEEP + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE diff --git a/kernel/power/process.c b/kernel/power/process.c index 278946aecaf0..444cea80fde8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p) return 1; } -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - wmb(); - } - clear_freeze_flag(current); -} - -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ - long save; - - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) - break; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); -} - -static void fake_signal_wake_up(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); -} - -static inline bool should_send_signal(struct task_struct *p) -{ - return !(p->flags & PF_FREEZER_NOSIG); -} - -/** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise - * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. - */ -static bool freeze_task(struct task_struct *p, bool sig_only) -{ - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; - } - - if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else if (sig_only) { - return false; - } else { - wake_up_state(p, TASK_INTERRUPTIBLE); - } - - return true; -} - -static void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; @@ -264,4 +149,3 @@ void thaw_processes(void) printk("done.\n"); } -EXPORT_SYMBOL(refrigerator); -- cgit v1.2.1 From dc52ddc0e6f45b04780b26fc0813509f8e798c42 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:21 -0700 Subject: container freezer: implement freezer cgroup subsystem This patch implements a new freezer subsystem in the control groups framework. It provides a way to stop and resume execution of all tasks in a cgroup by writing in the cgroup filesystem. The freezer subsystem in the container filesystem defines a file named freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the cgroup. Subsequently writing "RUNNING" will unfreeze the tasks in the cgroup. Reading will return the current state. * Examples of usage : # mkdir /containers/freezer # mount -t cgroup -ofreezer freezer /containers # mkdir /containers/0 # echo $some_pid > /containers/0/tasks to get status of the freezer subsystem : # cat /containers/0/freezer.state RUNNING to freeze all tasks in the container : # echo FROZEN > /containers/0/freezer.state # cat /containers/0/freezer.state FREEZING # cat /containers/0/freezer.state FROZEN to unfreeze all tasks in the container : # echo RUNNING > /containers/0/freezer.state # cat /containers/0/freezer.state RUNNING This is the basic mechanism which should do the right thing for user space task in a simple scenario. It's important to note that freezing can be incomplete. In that case we return EBUSY. This means that some tasks in the cgroup are busy doing something that prevents us from completely freezing the cgroup at this time. After EBUSY, the cgroup will remain partially frozen -- reflected by freezer.state reporting "FREEZING" when read. The state will remain "FREEZING" until one of these things happens: 1) Userspace cancels the freezing operation by writing "RUNNING" to the freezer.state file 2) Userspace retries the freezing operation by writing "FROZEN" to the freezer.state file (writing "FREEZING" is not legal and returns EIO) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: export thaw_process] Signed-off-by: Cedric Le Goater Signed-off-by: Matt Helsley Acked-by: Serge E. Hallyn Tested-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Kconfig.freezer | 2 + kernel/Makefile | 1 + kernel/cgroup_freezer.c | 366 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/freezer.c | 32 +++++ kernel/power/Kconfig | 3 - 5 files changed, 401 insertions(+), 3 deletions(-) create mode 100644 kernel/Kconfig.freezer create mode 100644 kernel/cgroup_freezer.c (limited to 'kernel') diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 000000000000..a3bb4cb52539 --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER + def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Makefile b/kernel/Makefile index e8194d15d5f4..066550aa61c5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 000000000000..b08722de610c --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,366 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include + +enum freezer_state { + STATE_RUNNING = 0, + STATE_FREEZING, + STATE_FROZEN, +}; + +struct freezer { + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( + struct cgroup *cgroup) +{ + return container_of( + cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return container_of(task_subsys_state(task, freezer_subsys_id), + struct freezer, css); +} + +int cgroup_frozen(struct task_struct *task) +{ + struct freezer *freezer; + enum freezer_state state; + + task_lock(task); + freezer = task_freezer(task); + state = freezer->state; + task_unlock(task); + + return state == STATE_FROZEN; +} + +/* + * cgroups_write_string() limits the size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { + "RUNNING", + "FREEZING", + "FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. + * + * (RUNNING) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______RUNNING_______/ | + * \_____________________________RUNNING___________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * css_set_lock + * cgroup_mutex (AKA cgroup_lock) + * task->alloc_lock (AKA task_lock) + * freezer->lock + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * can_attach(): + * cgroup_mutex + * + * cgroup_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * task->alloc_lock (to get task's cgroup) + * freezer->lock + * sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * sighand->siglock + * + * freezer_write() (unfreeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * task->alloc_lock (to prevent races with freeze_task()) + * sighand->siglock + */ +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&freezer->lock); + freezer->state = STATE_RUNNING; + return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); +} + + +static int freezer_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, + struct task_struct *task) +{ + struct freezer *freezer; + int retval = 0; + + /* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ + freezer = cgroup_freezer(new_cgroup); + if (freezer->state == STATE_FROZEN) + retval = -EBUSY; + return retval; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct freezer *freezer; + + task_lock(task); + freezer = task_freezer(task); + task_unlock(task); + + BUG_ON(freezer->state == STATE_FROZEN); + spin_lock_irq(&freezer->lock); + /* Locking avoids race with FREEZING -> RUNNING transitions. */ + if (freezer->state == STATE_FREEZING) + freeze_task(task, true); + spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void check_if_frozen(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + /* + * Task is frozen or will freeze immediately when next it gets + * woken + */ + if (frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task))) + nfrozen++; + } + + /* + * Transition to FROZEN when no new tasks can be added ensures + * that we never exist in the FROZEN state while there are unfrozen + * tasks. + */ + if (nfrozen == ntotal) + freezer->state = STATE_FROZEN; + cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct freezer *freezer; + enum freezer_state state; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + state = freezer->state; + if (state == STATE_FREEZING) { + /* We change from FREEZING to FROZEN lazily if the cgroup was + * only partially frozen when we exitted write. */ + check_if_frozen(cgroup, freezer); + state = freezer->state; + } + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + + seq_puts(m, freezer_state_strs[state]); + seq_putc(m, '\n'); + return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int num_cant_freeze_now = 0; + + freezer->state = STATE_FREEZING; + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (!freeze_task(task, true)) + continue; + if (task_is_stopped_or_traced(task) && freezing(task)) + /* + * The freeze flag is set so these tasks will + * immediately go into the fridge upon waking. + */ + continue; + if (!freezing(task) && !freezer_should_skip(task)) + num_cant_freeze_now++; + } + cgroup_iter_end(cgroup, &it); + + return num_cant_freeze_now ? -EBUSY : 0; +} + +static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + int do_wake; + + task_lock(task); + do_wake = __thaw_process(task); + task_unlock(task); + if (do_wake) + wake_up_process(task); + } + cgroup_iter_end(cgroup, &it); + freezer->state = STATE_RUNNING; + + return 0; +} + +static int freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) +{ + struct freezer *freezer; + int retval = 0; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + check_if_frozen(cgroup, freezer); /* may update freezer->state */ + if (goal_state == freezer->state) + goto out; + switch (freezer->state) { + case STATE_RUNNING: + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + case STATE_FREEZING: + if (goal_state == STATE_FROZEN) { + /* Userspace is retrying after + * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + } + /* state == FREEZING and goal_state == RUNNING, so unfreeze */ + case STATE_FROZEN: + retval = unfreeze_cgroup(cgroup, freezer); + break; + default: + break; + } +out: + spin_unlock_irq(&freezer->lock); + + return retval; +} + +static int freezer_write(struct cgroup *cgroup, + struct cftype *cft, + const char *buffer) +{ + int retval; + enum freezer_state goal_state; + + if (strcmp(buffer, freezer_state_strs[STATE_RUNNING]) == 0) + goal_state = STATE_RUNNING; + else if (strcmp(buffer, freezer_state_strs[STATE_FROZEN]) == 0) + goal_state = STATE_FROZEN; + else + return -EIO; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + retval = freezer_change_state(cgroup, goal_state); + cgroup_unlock(); + return retval; +} + +static struct cftype files[] = { + { + .name = "state", + .read_seq_string = freezer_read, + .write_string = freezer_write, + }, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys freezer_subsys = { + .name = "freezer", + .create = freezer_create, + .destroy = freezer_destroy, + .populate = freezer_populate, + .subsys_id = freezer_subsys_id, + .can_attach = freezer_can_attach, + .attach = NULL, + .fork = freezer_fork, + .exit = NULL, +}; diff --git a/kernel/freezer.c b/kernel/freezer.c index cb0931f89306..ba6248b323ef 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -120,3 +120,35 @@ void cancel_freezing(struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } } + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ +int __thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + return 1; + } + clear_freeze_flag(p); + return 0; +} + +int thaw_process(struct task_struct *p) +{ + task_lock(p); + if (__thaw_process(p) == 1) { + task_unlock(p); + wake_up_process(p); + return 1; + } + task_unlock(p); + return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ebdd7f55273d..dcd165f92a88 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -85,9 +85,6 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y -config FREEZER - def_bool PM_SLEEP - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE -- cgit v1.2.1 From 5a06915c6df9b89cda5ddb3f8cce5f9a6be534d2 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:22 -0700 Subject: container freezer: skip frozen cgroups during power management resume When a system is resumed after a suspend, it will also unfreeze frozen cgroups. This patchs modifies the resume sequence to skip the tasks which are part of a frozen control group. Signed-off-by: Cedric Le Goater Signed-off-by: Matt Helsley Acked-by: Serge E. Hallyn Tested-by: Matt Helsley Acked-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 444cea80fde8..ca634019497a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -135,6 +135,9 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; + if (cgroup_frozen(p)) + continue; + thaw_process(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -- cgit v1.2.1 From 957a4eeaf4af614ab0fc4c09a22593d6ab233f5b Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:22 -0700 Subject: container freezer: prevent frozen tasks or cgroups from changing Don't let frozen tasks or cgroups change. This means frozen tasks can't leave their current cgroup for another cgroup. It also means that tasks cannot be added to or removed from a cgroup in the FROZEN state. We enforce these rules by checking for frozen tasks and cgroups in the can_attach() function. Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b08722de610c..df884022fbd9 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -145,22 +145,40 @@ static void freezer_destroy(struct cgroup_subsys *ss, kfree(cgroup_freezer(cgroup)); } +/* Task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} +/* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, struct task_struct *task) { struct freezer *freezer; - int retval = 0; + int retval; + + /* Anything frozen can't move or be moved to/from */ + + if (is_task_frozen_enough(task)) + return -EBUSY; - /* - * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence the - * can_attach() result will remain valid until the attach completes. - */ freezer = cgroup_freezer(new_cgroup); + if (freezer->state == STATE_FROZEN) + return -EBUSY; + + retval = 0; + task_lock(task); + freezer = task_freezer(task); if (freezer->state == STATE_FROZEN) retval = -EBUSY; + task_unlock(task); return retval; } @@ -193,12 +211,7 @@ static void check_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - /* - * Task is frozen or will freeze immediately when next it gets - * woken - */ - if (frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task))) + if (is_task_frozen_enough(task)) nfrozen++; } @@ -249,11 +262,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) continue; - if (task_is_stopped_or_traced(task) && freezing(task)) - /* - * The freeze flag is set so these tasks will - * immediately go into the fridge upon waking. - */ + if (is_task_frozen_enough(task)) continue; if (!freezing(task) && !freezer_should_skip(task)) num_cant_freeze_now++; -- cgit v1.2.1 From 81dcf33c2ae314899f754aa7aaa1cb1fe2f84da6 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:23 -0700 Subject: container freezer: make freezer state names less generic Rename cgroup freezer states to be less generic to avoid any name collisions while also better describing what each state is. Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 56 ++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index df884022fbd9..6e4280d77429 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -22,9 +22,9 @@ #include enum freezer_state { - STATE_RUNNING = 0, - STATE_FREEZING, - STATE_FROZEN, + CGROUP_THAWED = 0, + CGROUP_FREEZING, + CGROUP_FROZEN, }; struct freezer { @@ -57,7 +57,7 @@ int cgroup_frozen(struct task_struct *task) state = freezer->state; task_unlock(task); - return state == STATE_FROZEN; + return state == CGROUP_FROZEN; } /* @@ -65,7 +65,7 @@ int cgroup_frozen(struct task_struct *task) * CGROUP_LOCAL_BUFFER_SIZE */ static const char *freezer_state_strs[] = { - "RUNNING", + "THAWED", "FREEZING", "FROZEN", }; @@ -75,10 +75,10 @@ static const char *freezer_state_strs[] = { * Transitions are caused by userspace writes to the freezer.state file. * The values in parenthesis are state labels. The rest are edge labels. * - * (RUNNING) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) - * ^ ^ | | - * | \_______RUNNING_______/ | - * \_____________________________RUNNING___________/ + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______THAWED_______/ | + * \__________________________THAWED____________/ */ struct cgroup_subsys freezer_subsys; @@ -135,7 +135,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, return ERR_PTR(-ENOMEM); spin_lock_init(&freezer->lock); - freezer->state = STATE_RUNNING; + freezer->state = CGROUP_THAWED; return &freezer->css; } @@ -170,13 +170,13 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return -EBUSY; freezer = cgroup_freezer(new_cgroup); - if (freezer->state == STATE_FROZEN) + if (freezer->state == CGROUP_FROZEN) return -EBUSY; retval = 0; task_lock(task); freezer = task_freezer(task); - if (freezer->state == STATE_FROZEN) + if (freezer->state == CGROUP_FROZEN) retval = -EBUSY; task_unlock(task); return retval; @@ -190,10 +190,10 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) freezer = task_freezer(task); task_unlock(task); - BUG_ON(freezer->state == STATE_FROZEN); + BUG_ON(freezer->state == CGROUP_FROZEN); spin_lock_irq(&freezer->lock); - /* Locking avoids race with FREEZING -> RUNNING transitions. */ - if (freezer->state == STATE_FREEZING) + /* Locking avoids race with FREEZING -> THAWED transitions. */ + if (freezer->state == CGROUP_FREEZING) freeze_task(task, true); spin_unlock_irq(&freezer->lock); } @@ -221,7 +221,7 @@ static void check_if_frozen(struct cgroup *cgroup, * tasks. */ if (nfrozen == ntotal) - freezer->state = STATE_FROZEN; + freezer->state = CGROUP_FROZEN; cgroup_iter_end(cgroup, &it); } @@ -237,7 +237,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); state = freezer->state; - if (state == STATE_FREEZING) { + if (state == CGROUP_FREEZING) { /* We change from FREEZING to FROZEN lazily if the cgroup was * only partially frozen when we exitted write. */ check_if_frozen(cgroup, freezer); @@ -257,7 +257,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; unsigned int num_cant_freeze_now = 0; - freezer->state = STATE_FREEZING; + freezer->state = CGROUP_FREEZING; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) @@ -288,7 +288,7 @@ static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) wake_up_process(task); } cgroup_iter_end(cgroup, &it); - freezer->state = STATE_RUNNING; + freezer->state = CGROUP_THAWED; return 0; } @@ -305,18 +305,18 @@ static int freezer_change_state(struct cgroup *cgroup, if (goal_state == freezer->state) goto out; switch (freezer->state) { - case STATE_RUNNING: + case CGROUP_THAWED: retval = try_to_freeze_cgroup(cgroup, freezer); break; - case STATE_FREEZING: - if (goal_state == STATE_FROZEN) { + case CGROUP_FREEZING: + if (goal_state == CGROUP_FROZEN) { /* Userspace is retrying after * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ retval = try_to_freeze_cgroup(cgroup, freezer); break; } - /* state == FREEZING and goal_state == RUNNING, so unfreeze */ - case STATE_FROZEN: + /* state == FREEZING and goal_state == THAWED, so unfreeze */ + case CGROUP_FROZEN: retval = unfreeze_cgroup(cgroup, freezer); break; default: @@ -335,10 +335,10 @@ static int freezer_write(struct cgroup *cgroup, int retval; enum freezer_state goal_state; - if (strcmp(buffer, freezer_state_strs[STATE_RUNNING]) == 0) - goal_state = STATE_RUNNING; - else if (strcmp(buffer, freezer_state_strs[STATE_FROZEN]) == 0) - goal_state = STATE_FROZEN; + if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) + goal_state = CGROUP_THAWED; + else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) + goal_state = CGROUP_FROZEN; else return -EIO; -- cgit v1.2.1 From 1aece34833721d64eb33fc15cd923c727296d3d3 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:24 -0700 Subject: container freezer: rename check_if_frozen() check_if_frozen() sounds like it should return something when in fact it's just updating the freezer state. Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6e4280d77429..e95056954498 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -201,8 +201,8 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* * caller must hold freezer->lock */ -static void check_if_frozen(struct cgroup *cgroup, - struct freezer *freezer) +static void update_freezer_state(struct cgroup *cgroup, + struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; @@ -222,6 +222,10 @@ static void check_if_frozen(struct cgroup *cgroup, */ if (nfrozen == ntotal) freezer->state = CGROUP_FROZEN; + else if (nfrozen > 0) + freezer->state = CGROUP_FREEZING; + else + freezer->state = CGROUP_THAWED; cgroup_iter_end(cgroup, &it); } @@ -240,7 +244,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, if (state == CGROUP_FREEZING) { /* We change from FREEZING to FROZEN lazily if the cgroup was * only partially frozen when we exitted write. */ - check_if_frozen(cgroup, freezer); + update_freezer_state(cgroup, freezer); state = freezer->state; } spin_unlock_irq(&freezer->lock); @@ -301,7 +305,7 @@ static int freezer_change_state(struct cgroup *cgroup, freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); - check_if_frozen(cgroup, freezer); /* may update freezer->state */ + update_freezer_state(cgroup, freezer); if (goal_state == freezer->state) goto out; switch (freezer->state) { -- cgit v1.2.1 From c3b9f5afc7f4c8abc0452f132edb13ac8c1fafbf Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 18 Oct 2008 20:27:29 -0700 Subject: kernel/configs.c: remove useless comments These comments are useless, remove them. Signed-off-by: WANG Cong Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index 4c345210ed8c..abaee684ecbf 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -54,9 +54,6 @@ #ifdef CONFIG_IKCONFIG_PROC -/**************************************************/ -/* globals and useful constants */ - static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) @@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = { .read = ikconfig_read_current, }; -/***************************************************/ -/* ikconfig_init: start up everything we need to */ - static int __init ikconfig_init(void) { struct proc_dir_entry *entry; @@ -89,9 +83,6 @@ static int __init ikconfig_init(void) return 0; } -/***************************************************/ -/* ikconfig_cleanup: clean up our mess */ - static void __exit ikconfig_cleanup(void) { remove_proc_entry("config.gz", NULL); -- cgit v1.2.1 From 146aa1bd0511f88ddb4e92fafa2b8aad4f2f65f3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 18 Oct 2008 20:28:03 -0700 Subject: cgroups: fix probable race with put_css_set[_taskexit] and find_css_set put_css_set_taskexit may be called when find_css_set is called on other cpu. And the race will occur: put_css_set_taskexit side find_css_set side | atomic_dec_and_test(&kref->refcount) | /* kref->refcount = 0 */ | .................................................................... | read_lock(&css_set_lock) | find_existing_css_set | get_css_set | read_unlock(&css_set_lock); .................................................................... __release_css_set | .................................................................... | /* use a released css_set */ | [put_css_set is the same. But in the current code, all put_css_set are put into cgroup mutex critical region as the same as find_css_set.] [akpm@linux-foundation.org: repair comments] [menage@google.com: eliminate race in css_set refcounting] Signed-off-by: Lai Jiangshan Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 43 ++++++++++++++++++++----------------------- kernel/cgroup_debug.c | 4 ++-- 2 files changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c6e1c17e6d3..1e49218457e0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg) struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - write_lock(&css_set_lock); hlist_del(&cg->hlist); css_set_count--; @@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg) list_del(&link->cgrp_link_list); kfree(link); } - - write_unlock(&css_set_lock); } -static void __release_css_set(struct kref *k, int taskexit) +static void __put_css_set(struct css_set *cg, int taskexit) { int i; - struct css_set *cg = container_of(k, struct css_set, ref); - + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cg->refcount, -1, 1)) + return; + write_lock(&css_set_lock); + if (!atomic_dec_and_test(&cg->refcount)) { + write_unlock(&css_set_lock); + return; + } unlink_css_set(cg); + write_unlock(&css_set_lock); rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { @@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit) kfree(cg); } -static void release_css_set(struct kref *k) -{ - __release_css_set(k, 0); -} - -static void release_css_set_taskexit(struct kref *k) -{ - __release_css_set(k, 1); -} - /* * refcounted get/put for css_set objects */ static inline void get_css_set(struct css_set *cg) { - kref_get(&cg->ref); + atomic_inc(&cg->refcount); } static inline void put_css_set(struct css_set *cg) { - kref_put(&cg->ref, release_css_set); + __put_css_set(cg, 0); } static inline void put_css_set_taskexit(struct css_set *cg) { - kref_put(&cg->ref, release_css_set_taskexit); + __put_css_set(cg, 1); } /* @@ -427,7 +425,7 @@ static struct css_set *find_css_set( return NULL; } - kref_init(&res->ref); + atomic_set(&res->refcount, 1); INIT_LIST_HEAD(&res->cg_links); INIT_LIST_HEAD(&res->tasks); INIT_HLIST_NODE(&res->hlist); @@ -1728,7 +1726,7 @@ int cgroup_task_count(const struct cgroup *cgrp) read_lock(&css_set_lock); list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->ref.refcount); + count += atomic_read(&link->cg->refcount); } read_unlock(&css_set_lock); return count; @@ -2495,8 +2493,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) int __init cgroup_init_early(void) { int i; - kref_init(&init_css_set.ref); - kref_get(&init_css_set.ref); + atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cg_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index c3dc3aba4c02..daca6209202d 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, u64 count; rcu_read_lock(); - count = atomic_read(¤t->cgroups->ref.refcount); + count = atomic_read(¤t->cgroups->refcount); rcu_read_unlock(); return count; } @@ -90,7 +90,7 @@ static struct cftype files[] = { { .name = "releasable", .read_u64 = releasable_read, - } + }, }; static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -- cgit v1.2.1 From cc31edceee04a7b87f2be48f9489ebb72d264844 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Sat, 18 Oct 2008 20:28:04 -0700 Subject: cgroups: convert tasks file to use a seq_file with shared pid array Rather than pre-generating the entire text for the "tasks" file each time the file is opened, we instead just generate/update the array of process ids and use a seq_file to report these to userspace. All open file handles on the same "tasks" file can share a pid array, which may be updated any time that no thread is actively reading the array. By sharing the array, the potential for userspace to DoS the system by opening many handles on the same "tasks" file is removed. [Based on a patch by Lai Jiangshan, extended to use seq_file] Signed-off-by: Paul Menage Reviewed-by: Lai Jiangshan Cc: Serge Hallyn Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 222 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 139 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1e49218457e0..046c1609606b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -868,6 +868,14 @@ static struct super_operations cgroup_ops = { .remount_fs = cgroup_remount, }; +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ + INIT_LIST_HEAD(&cgrp->sibling); + INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->release_list); + init_rwsem(&cgrp->pids_mutex); +} static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -876,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1995,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here). The struct - * ctr_struct * is stored in file->private_data. Its resources will - * be freed by release() when the file is closed. The array is used - * to sprintf the PIDs and then used by read(). */ -struct ctr_struct { - char *buf; - int bufsz; -}; /* * Load into 'pidarray' up to 'npids' of the tasks using cgroup @@ -2086,42 +2082,132 @@ static int cmppid(const void *a, const void *b) return *(pid_t *)a - *(pid_t *)b; } + /* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'. Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. + * seq_file methods for the "tasks" file. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->tasks_pids array. */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) + +static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) { - int cnt = 0; - int i; + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct cgroup *cgrp = s->private; + int index = 0, pid = *pos; + int *iter; - for (i = 0; i < npids; i++) - cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); - return cnt; + down_read(&cgrp->pids_mutex); + if (pid) { + int end = cgrp->pids_length; + int i; + while (index < end) { + int mid = (index + end) / 2; + if (cgrp->tasks_pids[mid] == pid) { + index = mid; + break; + } else if (cgrp->tasks_pids[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= cgrp->pids_length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = cgrp->tasks_pids + index; + *pos = *iter; + return iter; +} + +static void cgroup_tasks_stop(struct seq_file *s, void *v) +{ + struct cgroup *cgrp = s->private; + up_read(&cgrp->pids_mutex); } +static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct cgroup *cgrp = s->private; + int *p = v; + int *end = cgrp->tasks_pids + cgrp->pids_length; + + /* + * Advance to the next pid in the array. If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_tasks_show(struct seq_file *s, void *v) +{ + return seq_printf(s, "%d\n", *(int *)v); +} + +static struct seq_operations cgroup_tasks_seq_operations = { + .start = cgroup_tasks_start, + .stop = cgroup_tasks_stop, + .next = cgroup_tasks_next, + .show = cgroup_tasks_show, +}; + +static void release_cgroup_pid_array(struct cgroup *cgrp) +{ + down_write(&cgrp->pids_mutex); + BUG_ON(!cgrp->pids_use_count); + if (!--cgrp->pids_use_count) { + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = NULL; + cgrp->pids_length = 0; + } + up_write(&cgrp->pids_mutex); +} + +static int cgroup_tasks_release(struct inode *inode, struct file *file) +{ + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + + if (!(file->f_mode & FMODE_READ)) + return 0; + + release_cgroup_pid_array(cgrp); + return seq_release(inode, file); +} + +static struct file_operations cgroup_tasks_operations = { + .read = seq_read, + .llseek = seq_lseek, + .write = cgroup_file_write, + .release = cgroup_tasks_release, +}; + /* - * Handle an open on 'tasks' file. Prepare a buffer listing the + * Handle an open on 'tasks' file. Prepare an array containing the * process id's of tasks currently attached to the cgroup being opened. - * - * Does not require any specific cgroup mutexes, and does not take any. */ + static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct ctr_struct *ctr; pid_t *pidarray; int npids; - char c; + int retval; + /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); - if (!ctr) - goto err0; - /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -2129,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * show up until sometime later on. */ npids = cgroup_task_count(cgrp); - if (npids) { - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - goto err1; - - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* Call pid_array_to_buf() twice, first just to get bufsz */ - ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; - ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); - if (!ctr->buf) - goto err2; - ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - - kfree(pidarray); - } else { - ctr->buf = NULL; - ctr->bufsz = 0; - } - file->private_data = ctr; - return 0; - -err2: - kfree(pidarray); -err1: - kfree(ctr); -err0: - return -ENOMEM; -} - -static ssize_t cgroup_tasks_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct ctr_struct *ctr = file->private_data; + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); + if (!pidarray) + return -ENOMEM; + npids = pid_array_load(pidarray, npids, cgrp); + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} + /* + * Store the array in the cgroup, freeing the old + * array if necessary + */ + down_write(&cgrp->pids_mutex); + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = pidarray; + cgrp->pids_length = npids; + cgrp->pids_use_count++; + up_write(&cgrp->pids_mutex); -static int cgroup_tasks_release(struct inode *unused_inode, - struct file *file) -{ - struct ctr_struct *ctr; + file->f_op = &cgroup_tasks_operations; - if (file->f_mode & FMODE_READ) { - ctr = file->private_data; - kfree(ctr->buf); - kfree(ctr); + retval = seq_open(file, &cgroup_tasks_seq_operations); + if (retval) { + release_cgroup_pid_array(cgrp); + return retval; } + ((struct seq_file *)file->private_data)->private = cgrp; return 0; } @@ -2208,7 +2268,6 @@ static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, - .read = cgroup_tasks_read, .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, @@ -2298,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, mutex_lock(&cgroup_mutex); - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->root = parent->root; -- cgit v1.2.1 From 40b6a76237563c70466ec7315f644ba87d57dbe5 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sat, 18 Oct 2008 20:28:18 -0700 Subject: cpuset.c: remove extra variable Remove the use of int cpus_nonempty variable from 'update_flag' function. Signed-off-by: Md.Rakib H. Mullick Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index eab7bd6628e0..81d4d2426fd4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1172,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, { struct cpuset trialcs; int err; - int cpus_nonempty, balance_flag_changed; + int balance_flag_changed; trialcs = *cs; if (turning_on) @@ -1184,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) return err; - cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(&trialcs)); @@ -1192,7 +1191,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); - if (cpus_nonempty && balance_flag_changed) + if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); return 0; -- cgit v1.2.1 From 30e8e13603c247c301fdacabef2a765c84840994 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 18 Oct 2008 20:28:20 -0700 Subject: cpuset: use seq_*mask_* to print masks 1) seq_file excepts that m->count == m->size when it's buf is full, so current code will causes bugs when buf is overflow. 2) There is not too good that cpuset accesses struct seq_file's fields directly. Signed-off-by: Lai Jiangshan Cc: Alexey Dobriyan Acked-by: Paul Menage Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 81d4d2426fd4..3e00526f52ec 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2436,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = { void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t"); - m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Cpus_allowed_list:\t"); - m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask_list(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); - m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed_list:\t"); - m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask_list(m, &task->mems_allowed); seq_printf(m, "\n"); } -- cgit v1.2.1 From b747c8c102cc0677a7a8056a093f58d7c9b500e7 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 18 Oct 2008 20:28:21 -0700 Subject: make ptrace_untrace() static ptrace_untrace() can now become static. Signed-off-by: Adrian Bunk Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 356699a96d56..1e68e4c39e2c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) * TASK_TRACED, resume it now. * Requires that irqs be disabled. */ -void ptrace_untrace(struct task_struct *child) +static void ptrace_untrace(struct task_struct *child) { spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { -- cgit v1.2.1 From 293adee601bcd4cdb5076a9bda187137de17e96e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 18 Oct 2008 20:28:24 -0700 Subject: kthread_bind: use wait_task_inactive(TASK_UNINTERRUPTIBLE) Now that wait_task_inactive(task, state) checks task->state == state, we can simplify the code and make this debugging check more robust. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 96cff2f8710b..14ec64fe175a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -171,12 +171,11 @@ EXPORT_SYMBOL(kthread_create); */ void kthread_bind(struct task_struct *k, unsigned int cpu) { - if (k->state != TASK_UNINTERRUPTIBLE) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; -- cgit v1.2.1 From acd99dbf54020f5c80b9aa2f2ea86f43cb285b02 Mon Sep 17 00:00:00 2001 From: Ken'ichi Ohmichi Date: Sat, 18 Oct 2008 20:28:30 -0700 Subject: kdump: add vmlist.addr to vmcoreinfo for x86 vmalloc translation. Add the symbols 'vmlist' and offset 'vm_struct.addr' to the vmcoreinfo[1] data for i386 vmalloc translation. makedumpfile[2] needs VMALLOC_START value for distinguishing a vmalloc address or not, because it should choose suitable translation method. If applying this patch, makedumpfile will be able to take VMALLOC_START value from 'vmlist.addr'. vmcoreinfo[1]: The vmcoreinfo data has the minimum debugging information only for dump filtering. makedumpfile[2] uses it to distinguish unnecessary pages and creates a small dumpfile. makedumpfile[2]: dump filtering command https://sourceforge.net/projects/makedumpfile/ Signed-off-by: Ken'ichi Ohmichi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index aef265325cd3..777ac458ac99 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1371,6 +1371,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(node_online_map); VMCOREINFO_SYMBOL(swapper_pg_dir); VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmlist); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); @@ -1406,6 +1407,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vm_struct, addr); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); -- cgit v1.2.1 From 1a651a00e20fd4997f0b91258f6f95b7d96edcd9 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 18 Oct 2008 20:28:37 -0700 Subject: byteorder: remove direct includes of linux/byteorder/swab[b].h A consolidated implementation will provide this generically through asm/byteorder, remove direct includes to avoid breakage when the changeover to the new implementation occurs. Signed-off-by: Harvey Harrison Acked-by: Mauro Carvalho Chehab Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupreempt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ca4bbbe04aa4..59236e8b9daa 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -54,9 +54,9 @@ #include #include #include -#include #include #include +#include /* * PREEMPT_RCU data structures. -- cgit v1.2.1 From f07767fd0f95c385108fa4c456a9cb216a424fec Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 20 Oct 2008 10:23:38 -0700 Subject: byteorder: remove direct includes of linux/byteorder/swab[b].h A consolidated implementation will provide this generically through asm/byteorder, remove direct includes to avoid breakage when the changeover to the new implementation occurs. This hunk was lost from commit 1d8cca44b6a244b7e378546d719041819049a0f9 ("byteorder: provide swabb.h generically in asm/byteorder.h") Signed-off-by: Harvey Harrison Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 90b5b123f7a1..85cb90588a55 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -42,10 +42,10 @@ #include #include #include -#include #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and " -- cgit v1.2.1 From 5f41b8cdc6ef33b3432cee36264d628a80398362 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Mon, 20 Oct 2008 15:23:40 -0700 Subject: kexec: fix crash_save_vmcoreinfo_init build problem This fixes kernel/kexec.c: In function 'crash_save_vmcoreinfo_init': kernel/kexec.c:1374: error: 'vmlist' undeclared (first use in this function) kernel/kexec.c:1374: error: (Each undeclared identifier is reported only once kernel/kexec.c:1374: error: for each function it appears in.) kernel/kexec.c:1410: error: invalid use of undefined type 'struct vm_struct' make[1]: *** [kernel/kexec.o] Error 1 Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- kernel/kexec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 777ac458ac99..ac0fde7b54d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include -- cgit v1.2.1 From 0b3682ba33c59a362901b478bdab965da888b350 Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Mon, 20 Oct 2008 12:41:58 -0600 Subject: genirq: fix set_irq_type() when recording trigger type Impact: fix boot hang on a G5 In set_irq_type() we want to pass the type rather than the current interrupt state. Signed-off-by: Chris Friesen Acked-by: Benjamin Herrenschmidt Acked-by: David Brownell Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4895fde4eb93..3de6ea3ee740 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -127,7 +127,7 @@ int set_irq_type(unsigned int irq, unsigned int type) return 0; spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, flags); + ret = __irq_set_trigger(desc, irq, type); spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.1 From 9a1c3542768b5a58e45a9216921cd10a3bae1205 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Feb 2008 20:40:24 -0500 Subject: [PATCH] pass fmode_t to blkdev_put() Signed-off-by: Al Viro --- kernel/power/swap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 80ccac849e46..7b9d611c1106 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -178,7 +178,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ res = set_blocksize(resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, FMODE_WRITE); return res; } @@ -574,7 +574,7 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, FMODE_READ); if (!error) pr_debug("PM: Image successfully loaded\n"); @@ -609,7 +609,7 @@ int swsusp_check(void) return -EINVAL; } if (error) - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, FMODE_READ); else pr_debug("PM: Signature found, resuming\n"); } else { @@ -633,7 +633,7 @@ void swsusp_close(void) return; } - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, 0); /* move up */ } static int swsusp_header_init(void) -- cgit v1.2.1 From c2dd0dae185423fb243b13d490c3fcfaa18ff333 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Oct 2007 13:21:10 -0400 Subject: [PATCH] propagate mode through swsusp_close() Signed-off-by: Al Viro --- kernel/power/disk.c | 2 +- kernel/power/power.h | 2 +- kernel/power/swap.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 331f9836383f..c9d74083746f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -651,7 +651,7 @@ static int software_resume(void) pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); if (error) { - swsusp_close(); + swsusp_close(FMODE_READ); goto Done; } diff --git a/kernel/power/power.h b/kernel/power/power.h index acc0c101dbd5..46b5ec7a3afb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -153,7 +153,7 @@ extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(void); +extern void swsusp_close(fmode_t); struct timeval; /* kernel/power/swsusp.c */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7b9d611c1106..178b001a4f17 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -426,7 +426,7 @@ int swsusp_write(unsigned int flags) release_swap_writer(&handle); out: - swsusp_close(); + swsusp_close(FMODE_WRITE); return error; } @@ -626,14 +626,14 @@ int swsusp_check(void) * swsusp_close - close swap device. */ -void swsusp_close(void) +void swsusp_close(fmode_t mode) { if (IS_ERR(resume_bdev)) { pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev, 0); /* move up */ + blkdev_put(resume_bdev, mode); /* move up */ } static int swsusp_header_init(void) -- cgit v1.2.1 From 572c48921574dbe6dceb958cf965aa962baefde4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Oct 2007 13:24:05 -0400 Subject: [PATCH] sanitize blkdev_get() and friends * get rid of fake struct file/struct dentry in __blkdev_get() * merge __blkdev_get() and do_open() * get rid of flags argument of blkdev_get() Signed-off-by: Al Viro --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 178b001a4f17..b7713b53d07a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -172,7 +172,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ return res; root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); + res = blkdev_get(resume_bdev, FMODE_WRITE); if (res) return res; -- cgit v1.2.1 From b6f3b7803a9231eddc36d0a2a6d2d8105ef89344 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Sat, 18 Oct 2008 16:06:56 -0700 Subject: genirq: NULL struct irq_desc's member 'name' in dynamic_irq_cleanup() If the member 'name' of the irq_desc structure happens to point to a character string that is resident within a kernel module, problems ensue if that module is rmmod'd (at which time dynamic_irq_cleanup() is called) and then later show_interrupts() is called by someone. It is also not a good thing if the character string resided in kmalloc'd space that has been kfree'd (after having called dynamic_irq_cleanup()). dynamic_irq_cleanup() fails to NULL the 'name' member and show_interrupts() references it on a few architectures (like h8300, sh and x86). Signed-off-by: Dean Nelson Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3de6ea3ee740..10b5092e9bfe 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -76,6 +76,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; + desc->name = NULL; spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.2.1 From 5f86515158ca86182c1dbecd546f1848121ba135 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 17 Oct 2008 14:40:30 +0800 Subject: rcupdate: fix bug of rcu_barrier*() current rcu_barrier_bh() is like this: void rcu_barrier_bh(void) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 0); /* * The queueing of callbacks in all CPUs must be atomic with * respect to RCU, otherwise one CPU may queue a callback, * wait for a grace period, decrement barrier count and call * complete(), while other CPUs have not yet queued anything. * So, we need to make sure that grace periods cannot complete * until all the callbacks are queued. */ rcu_read_lock(); on_each_cpu(rcu_barrier_func, (void *)RCU_BARRIER_BH, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } The inconsistency of the code and the comments show a bug here. rcu_read_lock() cannot make sure that "grace periods for RCU_BH cannot complete until all the callbacks are queued". it only make sure that race periods for RCU cannot complete until all the callbacks are queued. so we must use rcu_read_lock_bh() for rcu_barrier_bh(). like this: void rcu_barrier_bh(void) { ...... rcu_read_lock_bh(); on_each_cpu(rcu_barrier_func, (void *)RCU_BARRIER_BH, 1); rcu_read_unlock_bh(); ...... } and also rcu_barrier() rcu_barrier_sched() are implemented like this. it will bring a lot of duplicate code. My patch uses another way to fix this bug, please see the comment of my patch. Thank Paul E. McKenney for he rewrote the comment. Signed-off-by: Lai Jiangshan Reviewed-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 467d5940f624..ad63af8b2521 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -119,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type) /* Take cpucontrol mutex to protect against CPU hotplug */ mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 0); /* - * The queueing of callbacks in all CPUs must be atomic with - * respect to RCU, otherwise one CPU may queue a callback, - * wait for a grace period, decrement barrier count and call - * complete(), while other CPUs have not yet queued anything. - * So, we need to make sure that grace periods cannot complete - * until all the callbacks are queued. + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. */ - rcu_read_lock(); + atomic_set(&rcu_barrier_cpu_count, 1); on_each_cpu(rcu_barrier_func, (void *)type, 1); - rcu_read_unlock(); + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } -- cgit v1.2.1 From c4bd822e7b12a9008241d76db45b665f2fef180c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Oct 2008 20:17:35 +0200 Subject: NOHZ: fix thinko in the timer restart code path commit fb02fbc14d17837b4b7b02dbb36142c16a7bf208 (NOHZ: restart tick device from irq_enter()) solves the problem of stale jiffies when long running softirqs happen in a long idle sleep period, but it has a major thinko in it: When the interrupt which came in _is_ the timer interrupt which should expire ts->sched_timer then we cancel and rearm the timer _before_ it gets expired in hrtimer_interrupt() to the next period. That means the call back function is not called. This game can go on for ever :( Prevent this by making sure to only rearm the timer when the expiry time is more than one tick_period away. Otherwise keep it running as it is either already expired or will expiry at the right point to update jiffies. Signed-off-by: Thomas Gleixner Tested-by: Venkatesch Pallipadi --- kernel/time/tick-sched.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0581c11fe6c6..727c1ae0517a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -567,11 +567,21 @@ static void tick_nohz_switch_to_nohz(void) static void tick_nohz_kick_tick(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t delta, now; if (!ts->tick_stopped) return; - tick_nohz_restart(ts, ktime_get()); + /* + * Do not touch the tick device, when the next expiry is either + * already reached or less/equal than the tick period. + */ + now = ktime_get(); + delta = ktime_sub(ts->sched_timer.expires, now); + if (delta.tv64 <= tick_period.tv64) + return; + + tick_nohz_restart(ts, now); } #else -- cgit v1.2.1 From 5e458cc0f4770eea45d3c07110f01b3a94c72aa5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:13 -0500 Subject: module: simplify load_module. Linus' recent catch of stack overflow in load_module lead me to look at the code. A couple of helpers to get a section address and get objects from a section can help clean things up a little. (And in case you're wondering, the stack size also dropped from 328 to 284 bytes). Signed-off-by: Rusty Russell --- kernel/module.c | 235 ++++++++++++++++++++++++-------------------------------- 1 file changed, 99 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0d8d21ee792c..3d256681ab64 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -132,6 +132,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr, return 0; } +/* Find a module section, or NULL. */ +static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, + const char *secstrings, const char *name) +{ + /* Section 0 has sh_addr 0. */ + return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; +} + +/* Find a module section, or NULL. Fill in number of "objects" in section. */ +static void *section_objs(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = sechdrs[sec].sh_size / object_size; + return (void *)sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -1789,32 +1812,20 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG -static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) { - struct mod_debug *debug_info; - unsigned long pos, end; - unsigned int num_verbose; - - pos = sechdrs[verboseindex].sh_addr; - num_verbose = sechdrs[verboseindex].sh_size / - sizeof(struct mod_debug); - end = pos + (num_verbose * sizeof(struct mod_debug)); +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG + unsigned int i; - for (; pos < end; pos += sizeof(struct mod_debug)) { - debug_info = (struct mod_debug *)pos; - register_dynamic_debug_module(debug_info->modname, - debug_info->type, debug_info->logical_modname, - debug_info->flag_names, debug_info->hash, - debug_info->hash2); + for (i = 0; i < num; i++) { + register_dynamic_debug_module(debug[i].modname, + debug[i].type, + debug[i].logical_modname, + debug[i].flag_names, + debug[i].hash, debug[i].hash2); } -} -#else -static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, - unsigned int verboseindex) -{ -} #endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ +} static void *module_alloc_update_bounds(unsigned long size) { @@ -1843,37 +1854,14 @@ static noinline struct module *load_module(void __user *umod, unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; - unsigned int setupindex; - unsigned int exindex; - unsigned int exportindex; - unsigned int modindex; - unsigned int obsparmindex; - unsigned int infoindex; - unsigned int gplindex; - unsigned int crcindex; - unsigned int gplcrcindex; - unsigned int versindex; - unsigned int pcpuindex; - unsigned int gplfutureindex; - unsigned int gplfuturecrcindex; + unsigned int modindex, versindex, infoindex, pcpuindex; unsigned int unwindex = 0; -#ifdef CONFIG_UNUSED_SYMBOLS - unsigned int unusedindex; - unsigned int unusedcrcindex; - unsigned int unusedgplindex; - unsigned int unusedgplcrcindex; -#endif - unsigned int markersindex; - unsigned int markersstringsindex; - unsigned int verboseindex; - unsigned int tracepointsindex; - unsigned int tracepointsstringsindex; - unsigned int mcountindex; + unsigned int num_kp, num_mcount; + struct kernel_param *kp; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - void *mseg; - struct exception_table_entry *extable; + unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -1937,6 +1925,7 @@ static noinline struct module *load_module(void __user *umod, err = -ENOEXEC; goto free_hdr; } + /* This is temporary: point mod into copy of data. */ mod = (void *)sechdrs[modindex].sh_addr; if (symindex == 0) { @@ -1946,22 +1935,6 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } - /* Optional sections */ - exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); - gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); - gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); - gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); -#ifdef CONFIG_UNUSED_SYMBOLS - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); - unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); - unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); -#endif - setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); - exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); - obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); @@ -2117,42 +2090,57 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto cleanup; - /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ - mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); - mod->syms = (void *)sechdrs[exportindex].sh_addr; - if (crcindex) - mod->crcs = (void *)sechdrs[crcindex].sh_addr; - mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); - mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; - if (gplcrcindex) - mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; - mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / - sizeof(*mod->gpl_future_syms); - mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; - if (gplfuturecrcindex) - mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; + /* Now we've got everything in the final locations, we can + * find optional sections. */ + kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), + &num_kp); + mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); + mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_gpl_future"); #ifdef CONFIG_UNUSED_SYMBOLS - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - sizeof(*mod->unused_gpl_syms); - mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; - if (unusedcrcindex) - mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; - mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; - if (unusedgplcrcindex) - mod->unused_gpl_crcs - = (void *)sechdrs[unusedgplcrcindex].sh_addr; + mod->unused_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_unused_gpl"); +#endif + +#ifdef CONFIG_MARKERS + mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", + sizeof(*mod->markers), &mod->num_markers); +#endif +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(hdr, sechdrs, secstrings, + "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); #endif #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !crcindex) - || (mod->num_gpl_syms && !gplcrcindex) - || (mod->num_gpl_future_syms && !gplfuturecrcindex) + if ((mod->num_syms && !mod->crcs) + || (mod->num_gpl_syms && !mod->gpl_crcs) + || (mod->num_gpl_future_syms && !mod->gpl_future_crcs) #ifdef CONFIG_UNUSED_SYMBOLS - || (mod->num_unused_syms && !unusedcrcindex) - || (mod->num_unused_gpl_syms && !unusedgplcrcindex) + || (mod->num_unused_syms && !mod->unused_crcs) + || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); @@ -2161,16 +2149,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } #endif - markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); - markersstringsindex = find_sec(hdr, sechdrs, secstrings, - "__markers_strings"); - verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); - tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); - tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, - "__tracepoints_strings"); - - mcountindex = find_sec(hdr, sechdrs, secstrings, - "__mcount_loc"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2193,28 +2171,16 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto cleanup; } -#ifdef CONFIG_MARKERS - mod->markers = (void *)sechdrs[markersindex].sh_addr; - mod->num_markers = - sechdrs[markersindex].sh_size / sizeof(*mod->markers); -#endif -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; - mod->num_tracepoints = - sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); -#endif - /* Find duplicate symbols */ err = verify_export_symbols(mod); - if (err < 0) goto cleanup; /* Set up and sort exception table */ - mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); - mod->extable = extable = (void *)sechdrs[exindex].sh_addr; - sort_extable(extable, extable + mod->num_exentries); + mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + sort_extable(mod->extable, mod->extable + mod->num_exentries); /* Finally, copy percpu area over. */ percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, @@ -2223,11 +2189,17 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); if (!mod->taints) { + struct mod_debug *debug; + unsigned int num_debug; + #ifdef CONFIG_MARKERS marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif - dynamic_printk_setup(sechdrs, verboseindex); + debug = section_objs(hdr, sechdrs, secstrings, "__verbose", + sizeof(*debug), &num_debug); + dynamic_printk_setup(debug, num_debug); + #ifdef CONFIG_TRACEPOINTS tracepoint_update_probe_range(mod->tracepoints, mod->tracepoints + mod->num_tracepoints); @@ -2235,8 +2207,9 @@ static noinline struct module *load_module(void __user *umod, } /* sechdrs[0].sh_size is always zero */ - mseg = (void *)sechdrs[mcountindex].sh_addr; - ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", + sizeof(*mseg), &num_mcount); + ftrace_init_module(mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2261,7 +2234,7 @@ static noinline struct module *load_module(void __user *umod, set_fs(old_fs); mod->args = args; - if (obsparmindex) + if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", mod->name); @@ -2270,21 +2243,11 @@ static noinline struct module *load_module(void __user *umod, * strong_try_module_get() will fail. */ stop_machine(__link_module, mod, NULL); - /* Size of section 0 is 0, so this works well if no params */ - err = parse_args(mod->name, mod->args, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param), - NULL); + err = parse_args(mod->name, mod->args, kp, num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param)); + err = mod_sysfs_setup(mod, kp, num_kp); if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); -- cgit v1.2.1 From d72b37513cdfbd3f53f3d485a8c403cc96d2c95f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Aug 2008 10:09:00 +0200 Subject: Remove stop_machine during module load v2 Remove stop_machine during module load v2 module loading currently does a stop_machine on each module load to insert the module into the global module lists. Especially on larger systems this can be quite expensive. It does that to handle concurrent lock lessmodule list readers like kallsyms. I don't think stop_machine() is actually needed to insert something into a list though. There are no concurrent writers because the module mutex is taken. And the RCU list functions know how to insert a node into a list with the right memory ordering so that concurrent readers don't go off into the wood. So remove the stop_machine for the module list insert and just do a list_add_rcu() instead. Module removal will still do a stop_machine of course, it needs that for other reasons. v2: Revised readers based on Paul's comments. All readers that only rely on disabled preemption need to be changed to list_for_each_rcu(). Done that. The others are ok because they have the modules mutex. Also added a possible missing preempt disable for print_modules(). [cc Paul McKenney for review. It's not RCU, but quite similar.] Acked-by: Paul E. McKenney Signed-off-by: Rusty Russell --- kernel/module.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3d256681ab64..c0f1826e2d9e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -63,7 +64,7 @@ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) /* List of modules, protected by module_mutex or preempt_disable - * (add/delete uses stop_machine). */ + * (delete uses stop_machine/add uses RCU list operations). */ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); @@ -241,7 +242,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) return true; - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, NOT_GPL_ONLY, false }, @@ -1416,17 +1417,6 @@ static void mod_kobject_remove(struct module *mod) mod_sysfs_fini(mod); } -/* - * link the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __link_module(void *_mod) -{ - struct module *mod = _mod; - list_add(&mod->list, &modules); - return 0; -} - /* * unlink the module with the whole machine is stopped with interrupts off * - this defends against kallsyms not taking locks @@ -2239,9 +2229,13 @@ static noinline struct module *load_module(void __user *umod, mod->name); /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since - * strong_try_module_get() will fail. */ - stop_machine(__link_module, mod, NULL); + * info during argument parsing. Noone should access us, since + * strong_try_module_get() will fail. + * lockdep/oops can run asynchronous, so use the RCU list insertion + * function to insert in a way safe to concurrent readers. + * The mutex protects against concurrent writers. + */ + list_add_rcu(&mod->list, &modules); err = parse_args(mod->name, mod->args, kp, num_kp, NULL); if (err < 0) @@ -2436,7 +2430,7 @@ const char *module_address_lookup(unsigned long addr, const char *ret = NULL; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { if (modname) @@ -2459,7 +2453,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { const char *sym; @@ -2483,7 +2477,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { const char *sym; @@ -2510,7 +2504,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; @@ -2553,7 +2547,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) ret = mod_find_symname(mod, colon+1); *colon = ':'; } else { - list_for_each_entry(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) if ((ret = mod_find_symname(mod, name)) != 0) break; } @@ -2656,7 +2650,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (mod->num_exentries == 0) continue; @@ -2682,7 +2676,7 @@ int is_module_address(unsigned long addr) preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_core, mod->core_size)) { preempt_enable(); return 1; @@ -2703,7 +2697,7 @@ struct module *__module_text_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) if (within(addr, mod->module_init, mod->init_text_size) || within(addr, mod->module_core, mod->core_text_size)) return mod; @@ -2728,8 +2722,11 @@ void print_modules(void) char buf[8]; printk("Modules linked in:"); - list_for_each_entry(mod, &modules, list) + /* Most callers should already have preempt disabled, but make sure */ + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) printk(" %s%s", mod->name, module_flags(mod, buf)); + preempt_enable(); if (last_unloaded_module[0]) printk(" [last unloaded: %s]", last_unloaded_module); printk("\n"); -- cgit v1.2.1 From 730b69d225259565c705f5f5a11cb1aba69568f1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:22 -0500 Subject: module: check kernel param length at compile time, not runtime The kparam code tries to handle over-length parameter prefixes at runtime. Not only would I bet this has never been tested, it's not clear that truncating names is a good idea either. So let's check at compile time. We need to move the #define to moduleparam.h to do this, though. Signed-off-by: Rusty Russell --- kernel/params.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index afc46a23eb6d..aca07e1a050f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -585,17 +585,14 @@ static void __init param_sysfs_builtin(void) { struct kernel_param *kp, *kp_begin = NULL; unsigned int i, name_len, count = 0; - char modname[MODULE_NAME_LEN + 1] = ""; + char modname[MODULE_NAME_LEN] = ""; for (i=0; i < __stop___param - __start___param; i++) { char *dot; - size_t max_name_len; kp = &__start___param[i]; - max_name_len = - min_t(size_t, MODULE_NAME_LEN, strlen(kp->name)); - dot = memchr(kp->name, '.', max_name_len); + dot = strchr(kp->name, '.'); if (!dot) { DEBUGP("couldn't find period in first %d characters " "of %s\n", MODULE_NAME_LEN, kp->name); -- cgit v1.2.1 From 9b473de87209fa86eb421b23386693b461612f30 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:22 -0500 Subject: param: Fix duplicate module prefixes Instead of insisting each new module_param sysfs entry is unique, handle the case where it already exists (for builtin modules). The current code assumes that all identical prefixes are together in the section: true for normal uses, but not necessarily so if someone overrides MODULE_PARAM_PREFIX. More importantly, it's not true with the new "core_param()" code which uses "kernel" as a prefix. This simplifies the caller for the builtin case, at a slight loss of efficiency (we do the lookup every time to see if the directory exists). Signed-off-by: Rusty Russell Cc: Greg Kroah-Hartman --- kernel/params.c | 261 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 141 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index aca07e1a050f..f27c992a4625 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp) } /* sysfs output in /sys/modules/XYZ/parameters/ */ +#define to_module_attr(n) container_of(n, struct module_attribute, attr); +#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); extern struct kernel_param __start___param[], __stop___param[]; @@ -384,6 +386,7 @@ struct param_attribute struct module_param_attrs { + unsigned int num; struct attribute_group grp; struct param_attribute attrs[0]; }; @@ -434,69 +437,84 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #ifdef CONFIG_SYSFS /* - * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME - * @mk: struct module_kobject (contains parent kobject) - * @kparam: array of struct kernel_param, the actual parameter definitions - * @num_params: number of entries in array - * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules" + * add_sysfs_param - add a parameter to sysfs + * @mk: struct module_kobject + * @kparam: the actual parameter definition to add to sysfs + * @name: name of parameter * - * Create a kobject for a (per-module) group of parameters, and create files - * in sysfs. A pointer to the param_kobject is returned on success, - * NULL if there's no parameter to export, or other ERR_PTR(err). + * Create a kobject if for a (per-module) parameter if mp NULL, and + * create file in sysfs. Returns an error on out of memory. Always cleans up + * if there's an error. */ -static __modinit struct module_param_attrs * -param_sysfs_setup(struct module_kobject *mk, - struct kernel_param *kparam, - unsigned int num_params, - unsigned int name_skip) +static __modinit int add_sysfs_param(struct module_kobject *mk, + struct kernel_param *kp, + const char *name) { - struct module_param_attrs *mp; - unsigned int valid_attrs = 0; - unsigned int i, size[2]; - struct param_attribute *pattr; - struct attribute **gattr; - int err; - - for (i=0; iperm); + + if (!mk->mp) { + num = 0; + attrs = NULL; + } else { + num = mk->mp->num; + attrs = mk->mp->grp.attrs; } - if (!valid_attrs) - return NULL; - - size[0] = ALIGN(sizeof(*mp) + - valid_attrs * sizeof(mp->attrs[0]), - sizeof(mp->grp.attrs[0])); - size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); - - mp = kzalloc(size[0] + size[1], GFP_KERNEL); - if (!mp) - return ERR_PTR(-ENOMEM); + /* Enlarge. */ + new = krealloc(mk->mp, + sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), + GFP_KERNEL); + if (!new) { + kfree(mk->mp); + err = -ENOMEM; + goto fail; + } + attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); + if (!attrs) { + err = -ENOMEM; + goto fail_free_new; + } - mp->grp.name = "parameters"; - mp->grp.attrs = (void *)mp + size[0]; + /* Sysfs wants everything zeroed. */ + memset(new, 0, sizeof(*new)); + memset(&new->attrs[num], 0, sizeof(new->attrs[num])); + memset(&attrs[num], 0, sizeof(attrs[num])); + new->grp.name = "parameters"; + new->grp.attrs = attrs; + + /* Tack new one on the end. */ + new->attrs[num].param = kp; + new->attrs[num].mattr.show = param_attr_show; + new->attrs[num].mattr.store = param_attr_store; + new->attrs[num].mattr.attr.name = (char *)name; + new->attrs[num].mattr.attr.mode = kp->perm; + new->num = num+1; + + /* Fix up all the pointers, since krealloc can move us */ + for (num = 0; num < new->num; num++) + new->grp.attrs[num] = &new->attrs[num].mattr.attr; + new->grp.attrs[num] = NULL; + + mk->mp = new; + return 0; - pattr = &mp->attrs[0]; - gattr = &mp->grp.attrs[0]; - for (i = 0; i < num_params; i++) { - struct kernel_param *kp = &kparam[i]; - if (kp->perm) { - pattr->param = kp; - pattr->mattr.show = param_attr_show; - pattr->mattr.store = param_attr_store; - pattr->mattr.attr.name = (char *)&kp->name[name_skip]; - pattr->mattr.attr.mode = kp->perm; - *(gattr++) = &(pattr++)->mattr.attr; - } - } - *gattr = NULL; +fail_free_new: + kfree(new); +fail: + mk->mp = NULL; + return err; +} - if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { - kfree(mp); - return ERR_PTR(err); - } - return mp; +static void free_module_param_attrs(struct module_kobject *mk) +{ + kfree(mk->mp->grp.attrs); + kfree(mk->mp); + mk->mp = NULL; } #ifdef CONFIG_MODULES @@ -506,21 +524,33 @@ param_sysfs_setup(struct module_kobject *mk, * @kparam: module parameters (array) * @num_params: number of module parameters * - * Adds sysfs entries for module parameters, and creates a link from - * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ + * Adds sysfs entries for module parameters under + * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { - struct module_param_attrs *mp; + int i, err; + bool params = false; + + for (i = 0; i < num_params; i++) { + if (kparam[i].perm == 0) + continue; + err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); + if (err) + return err; + params = true; + } - mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); - if (IS_ERR(mp)) - return PTR_ERR(mp); + if (!params) + return 0; - mod->param_attrs = mp; - return 0; + /* Create the param group. */ + err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); + if (err) + free_module_param_attrs(&mod->mkobj); + return err; } /* @@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod, */ void module_param_sysfs_remove(struct module *mod) { - if (mod->param_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->param_attrs->grp); + if (mod->mkobj.mp) { + sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); /* We are positive that no one is using any param * attrs at this point. Deallocate immediately. */ - kfree(mod->param_attrs); - mod->param_attrs = NULL; + free_module_param_attrs(&mod->mkobj); } } #endif -/* - * kernel_param_sysfs_setup - wrapper for built-in params support - */ -static void __init kernel_param_sysfs_setup(const char *name, - struct kernel_param *kparam, - unsigned int num_params, - unsigned int name_skip) +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) { struct module_kobject *mk; - int ret; + struct kobject *kobj; + int err; - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); - if (ret) { - kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed to be added to sysfs, " - "error number %d\n", name, ret); - printk(KERN_ERR "The system will be unstable now.\n"); - return; + kobj = kset_find_obj(module_kset, name); + if (kobj) { + /* We already have one. Remove params so we can add more. */ + mk = to_module_kobject(kobj); + /* We need to remove it before adding parameters. */ + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + } else { + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); + + mk->mod = THIS_MODULE; + mk->kobj.kset = module_kset; + err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, + "%s", name); + if (err) { + kobject_put(&mk->kobj); + printk(KERN_ERR "Module '%s' failed add to sysfs, " + "error number %d\n", name, err); + printk(KERN_ERR "The system will be unstable now.\n"); + return; + } + /* So that exit path is even. */ + kobject_get(&mk->kobj); } - param_sysfs_setup(mk, kparam, num_params, name_skip); + + /* These should not fail at boot. */ + err = add_sysfs_param(mk, kparam, kparam->name + name_skip); + BUG_ON(err); + err = sysfs_create_group(&mk->kobj, &mk->mp->grp); + BUG_ON(err); kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); } /* @@ -579,18 +621,19 @@ static void __init kernel_param_sysfs_setup(const char *name, * The "module" name (KBUILD_MODNAME) is stored before a dot, the * "parameter" name is stored behind a dot in kernel_param->name. So, * extract the "module" name for all built-in kernel_param-eters, - * and for all who have the same, call kernel_param_sysfs_setup. + * and for all who have the same, call kernel_add_sysfs_param. */ static void __init param_sysfs_builtin(void) { - struct kernel_param *kp, *kp_begin = NULL; - unsigned int i, name_len, count = 0; - char modname[MODULE_NAME_LEN] = ""; + struct kernel_param *kp; + unsigned int name_len; + char modname[MODULE_NAME_LEN]; - for (i=0; i < __stop___param - __start___param; i++) { + for (kp = __start___param; kp < __stop___param; kp++) { char *dot; - kp = &__start___param[i]; + if (kp->perm == 0) + continue; dot = strchr(kp->name, '.'); if (!dot) { @@ -599,37 +642,15 @@ static void __init param_sysfs_builtin(void) continue; } name_len = dot - kp->name; - - /* new kbuild_modname? */ - if (strlen(modname) != name_len - || strncmp(modname, kp->name, name_len) != 0) { - /* add a new kobject for previous kernel_params. */ - if (count) - kernel_param_sysfs_setup(modname, - kp_begin, - count, - strlen(modname)+1); - - strncpy(modname, kp->name, name_len); - modname[name_len] = '\0'; - count = 0; - kp_begin = kp; - } - count++; + strncpy(modname, kp->name, name_len); + modname[name_len] = '\0'; + kernel_add_sysfs_param(modname, kp, name_len+1); } - - /* last kernel_params need to be registered as well */ - if (count) - kernel_param_sysfs_setup(modname, kp_begin, count, - strlen(modname)+1); } /* module-related sysfs stuff */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr); -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); - static ssize_t module_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) -- cgit v1.2.1 From 67e67ceaac5bf55dbdceb704ff2d763d438b5373 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:23 -0500 Subject: core_param() for genuinely core kernel parameters There are a lot of one-liner uses of __setup() in the kernel: they're cumbersome and not queryable (definitely not settable) via /sys. Yet it's ugly to simplify them to module_param(), because by default that inserts a prefix of the module name (usually filename). So, introduce a "core_param". The parameter gets no prefix, but appears in /sys/module/kernel/parameters/ (if non-zero perms arg). I thought about using the name "core", but that's more common than "kernel". And if you create a module called "kernel", you will die a horrible death. Signed-off-by: Rusty Russell --- kernel/params.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index f27c992a4625..b077f1b045d3 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -637,14 +637,14 @@ static void __init param_sysfs_builtin(void) dot = strchr(kp->name, '.'); if (!dot) { - DEBUGP("couldn't find period in first %d characters " - "of %s\n", MODULE_NAME_LEN, kp->name); - continue; + /* This happens for core_param() */ + strcpy(modname, "kernel"); + name_len = 0; + } else { + name_len = dot - kp->name + 1; + strlcpy(modname, kp->name, name_len); } - name_len = dot - kp->name; - strncpy(modname, kp->name, name_len); - modname[name_len] = '\0'; - kernel_add_sysfs_param(modname, kp, name_len+1); + kernel_add_sysfs_param(modname, kp, name_len); } } -- cgit v1.2.1 From f44dd164f37336f7e17fabf5740698fd10635172 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:24 -0500 Subject: Make panic= and panic_on_oops into core_params This allows them to be examined and set after boot, plus means they actually give errors if they are misused (eg. panic=yes). Signed-off-by: Rusty Russell --- kernel/panic.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index bda561ef3cdf..6513aac8e992 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,13 +34,6 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -static int __init panic_setup(char *str) -{ - panic_timeout = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("panic=", panic_setup); - static long no_blink(long time) { return 0; @@ -218,13 +211,6 @@ void add_taint(unsigned flag) } EXPORT_SYMBOL(add_taint); -static int __init pause_on_oops_setup(char *str) -{ - pause_on_oops = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("pause_on_oops=", pause_on_oops_setup); - static void spin_msec(int msecs) { int i; @@ -384,3 +370,6 @@ void __stack_chk_fail(void) } EXPORT_SYMBOL(__stack_chk_fail); #endif + +core_param(panic, panic_timeout, int, 0644); +core_param(pause_on_oops, pause_on_oops, int, 0644); -- cgit v1.2.1 From 0d557dc97f4bb501f086a03d0f00b99a7855d794 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 13 Oct 2008 23:50:09 +0200 Subject: workqueue: introduce create_rt_workqueue create_rt_workqueue will create a real time prioritized workqueue. This is needed for the conversion of stop_machine to a workqueue based implementation. This patch adds yet another parameter to __create_workqueue_key to tell it that we want an rt workqueue. However it looks like we rather should have something like "int type" instead of singlethread, freezable and rt. Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell Cc: Ingo Molnar --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 714afad46539..f928f2a87b9b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -62,6 +62,7 @@ struct workqueue_struct { const char *name; int singlethread; int freezeable; /* Freeze threads during suspend */ + int rt; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -766,6 +767,7 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct workqueue_struct *wq = cwq->wq; const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; struct task_struct *p; @@ -781,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) */ if (IS_ERR(p)) return PTR_ERR(p); - + if (cwq->wq->rt) + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; return 0; @@ -801,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) struct workqueue_struct *__create_workqueue_key(const char *name, int singlethread, int freezeable, + int rt, struct lock_class_key *key, const char *lock_name) { @@ -822,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); wq->singlethread = singlethread; wq->freezeable = freezeable; + wq->rt = rt; INIT_LIST_HEAD(&wq->list); if (singlethread) { -- cgit v1.2.1 From c9583e55fa2b08a230c549bd1e3c0bde6c50d9cc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 13 Oct 2008 23:50:10 +0200 Subject: stop_machine: use workqueues instead of kernel threads Convert stop_machine to a workqueue based approach. Instead of using kernel threads for stop_machine we now use a an rt workqueue to synchronize all cpus. This has the advantage that all needed per cpu threads are already created when stop_machine gets called. And therefore a call to stop_machine won't fail anymore. This is needed for s390 which needs a mechanism to synchronize all cpus without allocating any memory. As Rusty pointed out free_module() needs a non-failing stop_machine interface as well. As a side effect the stop_machine code gets simplified. Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell --- kernel/stop_machine.c | 111 +++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index af3c7cea258b..0e688c6a1a63 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -37,9 +37,13 @@ struct stop_machine_data { /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ static unsigned int num_threads; static atomic_t thread_ack; -static struct completion finished; static DEFINE_MUTEX(lock); +static struct workqueue_struct *stop_machine_wq; +static struct stop_machine_data active, idle; +static const cpumask_t *active_cpus; +static void *stop_machine_work; + static void set_state(enum stopmachine_state newstate) { /* Reset ack counter. */ @@ -51,21 +55,25 @@ static void set_state(enum stopmachine_state newstate) /* Last one to ack a state moves to the next state. */ static void ack_state(void) { - if (atomic_dec_and_test(&thread_ack)) { - /* If we're the last one to ack the EXIT, we're finished. */ - if (state == STOPMACHINE_EXIT) - complete(&finished); - else - set_state(state + 1); - } + if (atomic_dec_and_test(&thread_ack)) + set_state(state + 1); } -/* This is the actual thread which stops the CPU. It exits by itself rather - * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ -static int stop_cpu(struct stop_machine_data *smdata) +/* This is the actual function which stops the CPU. It runs + * in the context of a dedicated stopmachine workqueue. */ +static void stop_cpu(struct work_struct *unused) { enum stopmachine_state curstate = STOPMACHINE_NONE; - + struct stop_machine_data *smdata = &idle; + int cpu = smp_processor_id(); + + if (!active_cpus) { + if (cpu == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(cpu, *active_cpus)) + smdata = &active; + } /* Simple state machine */ do { /* Chill out and ensure we re-read stopmachine_state. */ @@ -90,7 +98,6 @@ static int stop_cpu(struct stop_machine_data *smdata) } while (curstate != STOPMACHINE_EXIT); local_irq_enable(); - do_exit(0); } /* Callback for CPUs which aren't supposed to do anything. */ @@ -101,78 +108,34 @@ static int chill(void *unused) int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { - int i, err; - struct stop_machine_data active, idle; - struct task_struct **threads; + struct work_struct *sm_work; + int i; + /* Set up initial state. */ + mutex_lock(&lock); + num_threads = num_online_cpus(); + active_cpus = cpus; active.fn = fn; active.data = data; active.fnret = 0; idle.fn = chill; idle.data = NULL; - /* This could be too big for stack on large machines. */ - threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); - if (!threads) - return -ENOMEM; - - /* Set up initial state. */ - mutex_lock(&lock); - init_completion(&finished); - num_threads = num_online_cpus(); set_state(STOPMACHINE_PREPARE); - for_each_online_cpu(i) { - struct stop_machine_data *smdata = &idle; - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - if (!cpus) { - if (i == first_cpu(cpu_online_map)) - smdata = &active; - } else { - if (cpu_isset(i, *cpus)) - smdata = &active; - } - - threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", - i); - if (IS_ERR(threads[i])) { - err = PTR_ERR(threads[i]); - threads[i] = NULL; - goto kill_threads; - } - - /* Place it onto correct cpu. */ - kthread_bind(threads[i], i); - - /* Make it highest prio. */ - if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) - BUG(); - } - - /* We've created all the threads. Wake them all: hold this CPU so one + /* Schedule the stop_cpu work on all cpus: hold this CPU so one * doesn't hit this CPU until we're ready. */ get_cpu(); - for_each_online_cpu(i) - wake_up_process(threads[i]); - + for_each_online_cpu(i) { + sm_work = percpu_ptr(stop_machine_work, i); + INIT_WORK(sm_work, stop_cpu); + queue_work_on(i, stop_machine_wq, sm_work); + } /* This will release the thread on our CPU. */ put_cpu(); - wait_for_completion(&finished); + flush_workqueue(stop_machine_wq); mutex_unlock(&lock); - - kfree(threads); - return active.fnret; - -kill_threads: - for_each_online_cpu(i) - if (threads[i]) - kthread_stop(threads[i]); - mutex_unlock(&lock); - - kfree(threads); - return err; } int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) @@ -187,3 +150,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) return ret; } EXPORT_SYMBOL_GPL(stop_machine); + +static int __init stop_machine_init(void) +{ + stop_machine_wq = create_rt_workqueue("kstop"); + stop_machine_work = alloc_percpu(struct work_struct); + return 0; +} +early_initcall(stop_machine_init); -- cgit v1.2.1 From 8163bcac779f62c6bf847caed9bce905db0693fb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 22 Oct 2008 10:00:26 -0500 Subject: stop_machine: fix error code handling on multiple cpus Using |= for updating a value which might be updated on several cpus concurrently will not always work since we need to make sure that the update happens atomically. To fix this just use a write if the called function returns an error code on a cpu. We end up writing the error code of an arbitrary cpu if multiple ones fail but that should be sufficient. Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell --- kernel/stop_machine.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0e688c6a1a63..8aff79d90ddc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -66,6 +66,7 @@ static void stop_cpu(struct work_struct *unused) enum stopmachine_state curstate = STOPMACHINE_NONE; struct stop_machine_data *smdata = &idle; int cpu = smp_processor_id(); + int err; if (!active_cpus) { if (cpu == first_cpu(cpu_online_map)) @@ -86,9 +87,11 @@ static void stop_cpu(struct work_struct *unused) hard_irq_disable(); break; case STOPMACHINE_RUN: - /* |= allows error detection if functions on - * multiple CPUs. */ - smdata->fnret |= smdata->fn(smdata->data); + /* On multiple CPUs only a single error code + * is needed to tell that something failed. */ + err = smdata->fn(smdata->data); + if (err) + smdata->fnret = err; break; default: break; -- cgit v1.2.1 From 98bc993f99e51467057ef699e47fec020f24d233 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Aug 2008 01:06:21 -0400 Subject: [PATCH] get rid of nameidata in audit_tree Signed-off-by: Al Viro --- kernel/audit_tree.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f7921a2ecf16..8ba0e0d934f2 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -532,7 +532,7 @@ void audit_trim_trees(void) list_add(&cursor, &tree_list); while (cursor.next != &tree_list) { struct audit_tree *tree; - struct nameidata nd; + struct path path; struct vfsmount *root_mnt; struct node *node; struct list_head list; @@ -544,12 +544,12 @@ void audit_trim_trees(void) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = path_lookup(tree->pathname, 0, &nd); + err = kern_path(tree->pathname, 0, &path); if (err) goto skip_it; - root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); + root_mnt = collect_mounts(path.mnt, path.dentry); + path_put(&path); if (!root_mnt) goto skip_it; @@ -580,19 +580,19 @@ skip_it: } static int is_under(struct vfsmount *mnt, struct dentry *dentry, - struct nameidata *nd) + struct path *path) { - if (mnt != nd->path.mnt) { + if (mnt != path->mnt) { for (;;) { if (mnt->mnt_parent == mnt) return 0; - if (mnt->mnt_parent == nd->path.mnt) + if (mnt->mnt_parent == path->mnt) break; mnt = mnt->mnt_parent; } dentry = mnt->mnt_mountpoint; } - return is_subdir(dentry, nd->path.dentry); + return is_subdir(dentry, path->dentry); } int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) @@ -618,7 +618,7 @@ void audit_put_tree(struct audit_tree *tree) int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; - struct nameidata nd; + struct path path; struct vfsmount *mnt, *p; struct list_head list; int err; @@ -637,11 +637,11 @@ int audit_add_tree_rule(struct audit_krule *rule) /* do not set rule->tree yet */ mutex_unlock(&audit_filter_mutex); - err = path_lookup(tree->pathname, 0, &nd); + err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); + mnt = collect_mounts(path.mnt, path.dentry); + path_put(&path); if (!mnt) { err = -ENOMEM; goto Err; @@ -690,29 +690,29 @@ int audit_tag_tree(char *old, char *new) { struct list_head cursor, barrier; int failed = 0; - struct nameidata nd; + struct path path; struct vfsmount *tagged; struct list_head list; struct vfsmount *mnt; struct dentry *dentry; int err; - err = path_lookup(new, 0, &nd); + err = kern_path(new, 0, &path); if (err) return err; - tagged = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); + tagged = collect_mounts(path.mnt, path.dentry); + path_put(&path); if (!tagged) return -ENOMEM; - err = path_lookup(old, 0, &nd); + err = kern_path(old, 0, &path); if (err) { drop_collected_mounts(tagged); return err; } - mnt = mntget(nd.path.mnt); - dentry = dget(nd.path.dentry); - path_put(&nd.path); + mnt = mntget(path.mnt); + dentry = dget(path.dentry); + path_put(&path); if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) follow_up(&mnt, &dentry); @@ -733,7 +733,7 @@ int audit_tag_tree(char *old, char *new) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = path_lookup(tree->pathname, 0, &nd); + err = kern_path(tree->pathname, 0, &path); if (err) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -741,15 +741,15 @@ int audit_tag_tree(char *old, char *new) } spin_lock(&vfsmount_lock); - if (!is_under(mnt, dentry, &nd)) { + if (!is_under(mnt, dentry, &path)) { spin_unlock(&vfsmount_lock); - path_put(&nd.path); + path_put(&path); put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } spin_unlock(&vfsmount_lock); - path_put(&nd.path); + path_put(&path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); -- cgit v1.2.1 From 6e62775ece1c83a84d86df44cfd8ea3e6c96ce23 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 4 Oct 2008 14:28:09 +0400 Subject: proc: move /proc/execdomains to kernel/exec_domain.c Signed-off-by: Alexey Dobriyan --- kernel/exec_domain.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0d407e886735..0511716e9424 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -173,20 +175,39 @@ __set_personality(u_long personality) return 0; } -int -get_exec_domain_list(char *page) +#ifdef CONFIG_PROC_FS +static int execdomains_proc_show(struct seq_file *m, void *v) { struct exec_domain *ep; - int len = 0; read_lock(&exec_domains_lock); - for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) - len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", + for (ep = exec_domains; ep; ep = ep->next) + seq_printf(m, "%d-%d\t%-16s\t[%s]\n", ep->pers_low, ep->pers_high, ep->name, module_name(ep->module)); read_unlock(&exec_domains_lock); - return (len); + return 0; +} + +static int execdomains_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, execdomains_proc_show, NULL); +} + +static const struct file_operations execdomains_proc_fops = { + .open = execdomains_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_execdomains_init(void) +{ + proc_create("execdomains", 0, NULL, &execdomains_proc_fops); + return 0; } +module_init(proc_execdomains_init); +#endif asmlinkage long sys_personality(u_long personality) -- cgit v1.2.1 From 3b5d5c6b0ccba733a313f8752ebc3f8015628ba3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Oct 2008 13:19:27 +0400 Subject: proc: move /proc/modules boilerplate to kernel/module.c Signed-off-by: Alexey Dobriyan --- kernel/module.c | 59 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0d8d21ee792c..6fded84d7029 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -2599,23 +2601,6 @@ unsigned long module_kallsyms_lookup_name(const char *name) } #endif /* CONFIG_KALLSYMS */ -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - static char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -2649,6 +2634,24 @@ static char *module_flags(struct module *mod, char *buf) return buf; } +#ifdef CONFIG_PROC_FS +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); + return seq_list_start(&modules, *pos); +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} + +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} + static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); @@ -2679,13 +2682,33 @@ static int m_show(struct seq_file *m, void *p) Where refcount is a number or -, and deps is a comma-separated list of depends or -. */ -const struct seq_operations modules_op = { +static const struct seq_operations modules_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show }; +static int modules_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &modules_op); +} + +static const struct file_operations proc_modules_operations = { + .open = modules_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &proc_modules_operations); + return 0; +} +module_init(proc_modules_init); +#endif + /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { -- cgit v1.2.1 From b5aadf7f14c1acc94956aa257e018e9de3881f41 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Oct 2008 13:23:43 +0400 Subject: proc: move /proc/schedstat boilerplate to kernel/sched_stats.h Signed-off-by: Alexey Dobriyan --- kernel/sched.c | 1 + kernel/sched_stats.h | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d906f72b42d2..5a70189d5051 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index b8c156979cf2..3d14ce273902 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file) return res; } -const struct file_operations proc_schedstat_operations = { +static const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; +static int __init proc_schedstat_init(void) +{ + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; +} +module_init(proc_schedstat_init); + /* * Expects runqueue lock to be held for atomicity of update */ -- cgit v1.2.1 From d2441183dc222d12961ff2201f5086c846505d93 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Oct 2008 12:07:17 -0700 Subject: Fix compile warning in kernel/params.c Move free_module_param_attrs() into the CONFIG_MODULES section, since it's only used inside there. Thus avoiding the warning kernel/params.c:514: warning: 'free_module_param_attrs' defined but not used Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b077f1b045d3..a1e3025b19a9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -510,6 +510,7 @@ fail: return err; } +#ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { kfree(mk->mp->grp.attrs); @@ -517,7 +518,6 @@ static void free_module_param_attrs(struct module_kobject *mk) mk->mp = NULL; } -#ifdef CONFIG_MODULES /* * module_param_sysfs_setup - setup sysfs support for one module * @mod: module -- cgit v1.2.1 From 4403b406d4369a275d483ece6ddee0088cc0d592 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 25 Oct 2008 19:53:38 -0700 Subject: Revert "Call init_workqueues before pre smp initcalls." This reverts commit a802dd0eb5fc97a50cf1abb1f788a8f6cc5db635 by moving the call to init_workqueues() back where it belongs - after SMP has been initialized. It also moves stop_machine_init() - which needs workqueues - to a later phase using a core_initcall() instead of early_initcall(). That should satisfy all ordering requirements, and was apparently the reason why init_workqueues() was moved to be too early. Cc: Heiko Carstens Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 8aff79d90ddc..9bc4c00872c9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -160,4 +160,4 @@ static int __init stop_machine_init(void) stop_machine_work = alloc_percpu(struct work_struct); return 0; } -early_initcall(stop_machine_init); +core_initcall(stop_machine_init); -- cgit v1.2.1 From 2077776641b6ffb0049f13018d2e162340ec51c7 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 21 Oct 2008 16:11:20 +1100 Subject: cgroup: remove unused variable /scratch/sfr/next/kernel/cgroup.c: In function 'cgroup_tasks_start': /scratch/sfr/next/kernel/cgroup.c:2107: warning: unused variable 'i' Introduced in commit cc31edceee04a7b87f2be48f9489ebb72d264844 "cgroups: convert tasks file to use a seq_file with shared pid array". Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 046c1609606b..35eebd5510c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2104,7 +2104,7 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) down_read(&cgrp->pids_mutex); if (pid) { int end = cgrp->pids_length; - int i; + while (index < end) { int mid = (index + end) / 2; if (cgrp->tasks_pids[mid] == pid) { -- cgit v1.2.1