From ae9d67aff60af59548b6c7d1a74febea09660122 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 18 Jan 2011 06:48:12 +0100 Subject: audit: export symbol for use with xt_AUDIT When xt_AUDIT is built as a module, modpost reports a problem. MODPOST 322 modules ERROR: "audit_enabled" [net/netfilter/x_tables.ko] undefined! WARNING: modpost: Found 1 section mismatch(es). Cc: Thomas Graf Signed-off-by: Jan Engelhardt --- kernel/audit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 77770a034d59..5842f65bedcb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -74,6 +74,8 @@ static int audit_initialized; int audit_enabled; int audit_ever_enabled; +EXPORT_SYMBOL_GPL(audit_enabled); + /* Default state when kernel boots without any parameters. */ static int audit_default; -- cgit v1.2.1 From cd7eab44e9946c28d595abe3e9a43e945bc49141 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 19 Jan 2011 21:01:44 +0000 Subject: genirq: Add IRQ affinity notifiers When initiating I/O on a multiqueue and multi-IRQ device, we may want to select a queue for which the response will be handled on the same or a nearby CPU. This requires a reverse-map of IRQ affinity. Add a notification mechanism to support this. This is based closely on work by Thomas Gleixner . Signed-off-by: Ben Hutchings Cc: linux-net-drivers@solarflare.com Cc: Tom Herbert Cc: David Miller LKML-Reference: <1295470904.11126.84.camel@bwh-desktop> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..0587c5ceaed8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -134,6 +134,10 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) irq_set_thread_affinity(desc); } #endif + if (desc->affinity_notify) { + kref_get(&desc->affinity_notify->kref); + schedule_work(&desc->affinity_notify->work); + } desc->status |= IRQ_AFFINITY_SET; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -155,6 +159,79 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + struct irq_desc *desc = irq_to_desc(notify->irq); + cpumask_var_t cpumask; + unsigned long flags; + + if (!desc) + goto out; + + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PENDING) + cpumask_copy(cpumask, desc->pending_mask); + else +#endif + cpumask_copy(cpumask, desc->affinity); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + notify->notify(notify, cpumask); + + free_cpumask_var(cpumask); +out: + kref_put(¬ify->kref, notify->release); +} + +/** + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is + * freed using free_irq(). 
+ */ +int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_affinity_notify *old_notify; + unsigned long flags; + + /* The release function is promised process context */ + might_sleep(); + + if (!desc) + return -EINVAL; + + /* Complete initialisation of *notify */ + if (notify) { + notify->irq = irq; + kref_init(¬ify->kref); + INIT_WORK(¬ify->work, irq_affinity_notify); + } + + raw_spin_lock_irqsave(&desc->lock, flags); + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + if (old_notify) + kref_put(&old_notify->kref, old_notify->release); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); + #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. @@ -1004,6 +1081,11 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; +#ifdef CONFIG_SMP + if (WARN_ON(desc->affinity_notify)) + desc->affinity_notify = NULL; +#endif + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); chip_bus_sync_unlock(desc); -- cgit v1.2.1 From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:01 -0800 Subject: sched: Simplify update_cfs_shares parameters Re-visiting this: Since update_cfs_shares will now only ever re-weight an entity that is a relative parent of the current entity in enqueue_entity; we can safely issue the account_entity_enqueue relative to that cfs_rq and avoid the requirement for special handling of the enqueue case in update_cfs_shares. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.915214637@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 30 ++++++++++++++---------------- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..e0fa3ff7f194 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8510,7 +8510,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se), 0); + update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2df450e..0c550c841eee 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -540,7 +540,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* * Update the current task's runtime statistics. 
Skip current tasks that @@ -763,16 +763,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long load_weight, load, shares; - load = cfs_rq->load.weight + weight_delta; + load = cfs_rq->load.weight; load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; load_weight += load; + load_weight -= cfs_rq->load_contribution; shares = (tg->shares * load); if (load_weight) @@ -790,7 +789,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) { if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } } # else /* CONFIG_SMP */ @@ -798,8 +797,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } @@ -824,7 +822,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; struct sched_entity *se; @@ -838,7 +836,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) if (likely(se->load.weight == tg->shares)) return; #endif - shares = calc_cfs_shares(cfs_rq, tg, weight_delta); + shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } @@ -847,7 +845,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } @@ -978,8 +976,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1041,7 +1039,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1282,7 +1280,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -1312,7 +1310,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -2123,7 +2121,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. 
*/ - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.1 From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:03 -0800 Subject: sched: Avoid expensive initial update_cfs_load() Since cfs->{load_stamp,load_last} are zero-initalized the initial load update will consider the delta to be 'since the beginning of time'. This results in a lot of pointless divisions to bring this large period to be within the sysctl_sched_shares_window. Fix this by initializing load_stamp to be 1 at cfs_rq initialization, this allows for an initial load_stamp > load_last which then lets standard idle truncation proceed. We avoid spinning (and slightly improve consistency) by fixing delta to be [period - 1] in this path resulting in a slightly more predictable shares ramp. (Previously the amount of idle time preserved by the overflow would range between [period/2,period-1].) Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044852.102126037@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ kernel/sched_fair.c | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e0fa3ff7f194..6820b5b3a969 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7796,6 +7796,8 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c550c841eee..4cbc9121094c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -733,6 +733,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) now - cfs_rq->load_last > 4 * period) { cfs_rq->load_period = 0; cfs_rq->load_avg = 0; + delta = period - 1; } cfs_rq->load_stamp = now; -- cgit v1.2.1 From 4dd53d891ca46dcc1fde0376a33540d3fd83cb9a Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:00 -0800 Subject: softirqs: Free up pf flag PF_KSOFTIRQD Cleanup patch, freeing up PF_KSOFTIRQD and use per_cpu ksoftirqd pointer instead, as suggested by Eric Dumazet. 
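The kernel/ diff below only exports the per-cpu variable and switches the check in account_system_vtime(); the matching declaration/accessor change lives outside kernel/ and is not shown here. A minimal sketch of that side, assuming the usual per-cpu accessor pattern (only ksoftirqd and this_cpu_ksoftirqd() actually appear in the hunks below, the rest is illustrative):

	/* sketch of the header side, not part of the diff below */
	DECLARE_PER_CPU(struct task_struct *, ksoftirqd);

	static inline struct task_struct *this_cpu_ksoftirqd(void)
	{
		return this_cpu_read(ksoftirqd);
	}

With a pointer to each CPU's ksoftirqd thread available, "is the current task ksoftirqd?" becomes a plain comparison against the per-cpu value, and the PF_KSOFTIRQD bit in task->flags can be reclaimed.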
Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-2-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/softirq.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6820b5b3a969..8b89b3bba565 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1880,7 +1880,7 @@ void account_system_vtime(struct task_struct *curr) */ if (hardirq_count()) __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..0cee50487629 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", @@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); - current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { -- cgit v1.2.1 From a1dabb6bfffccb897eff3e1d102dacf2a4bedf3b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:01 -0800 Subject: time: Add nsecs_to_cputime64 interface for asm-generic Add nsecs_to_cputime64 interface. This is used in following patches that updates cpu irq stat based on ns granularity info in IRQ_TIME_ACCOUNTING. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-3-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/time.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..55337a816b20 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) } /** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * @@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ -unsigned long nsecs_to_jiffies(u64 n) +u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ @@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. 
+ * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { -- cgit v1.2.1 From 70a89a6620f658d47a1488515bada4b8ee6291d8 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:02 -0800 Subject: sched: Refactor account_system_time separating id-update Refactor account_system_time, to separate out the logic of identifying the update needed and code that does actual update. This is used by following patch for IRQ_TIME_ACCOUNTING, which has different identification logic and same update logic. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-4-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b89b3bba565..e3fa92106ed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3567,6 +3567,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, } } +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + cputime64_t tmp = cputime_to_cputime64(cputime); + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + /* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3578,31 +3604,21 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + cputime64_t *target_cputime64; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); return; } - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); - account_group_system_time(p, cputime); - - /* Add system time to cpustat. 
*/ - tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); + target_cputime64 = &cpustat->irq; else if (in_serving_softirq()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + target_cputime64 = &cpustat->softirq; else - cpustat->system = cputime64_add(cpustat->system, tmp); + target_cputime64 = &cpustat->system; - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); - - /* Account for system time used */ - acct_update_integrals(p); + __account_system_time(p, cputime, cputime_scaled, target_cputime64); } /* -- cgit v1.2.1 From abb74cefa9c682fb38ba86c17ca3c86fed6cc464 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:03 -0800 Subject: sched: Export ns irqtimes through /proc/stat CONFIG_IRQ_TIME_ACCOUNTING adds ns granularity irq time on each CPU. This info is already used in scheduler to do proper task chargeback (earlier patches). This patch retro-fits this ns granularity hardirq and softirq information to /proc/stat irq and softirq fields. The update is still done on timer tick, where we look at accumulated ns hardirq/softirq time and account the tick to user/system/irq/hardirq/guest accordingly. No new interface added. Earlier versions looked at adding this as new fields in some /proc files. This one seems to be the best in terms of impact to existing apps, even though it has somewhat more kernel code than earlier versions. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-5-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e3fa92106ed7..2a3c9799d76b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1920,8 +1920,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) sched_rt_avg_update(rq, irq_delta); } +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#define sched_clock_irqtime (0) + static void update_rq_clock_task(struct rq *rq, s64 delta) { rq->clock_task += delta; @@ -3621,6 +3653,65 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system 
and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + if (irqtime_account_hi_update()) { + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } else if (irqtime_account_si_update()) { + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->system); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif + /* * Account for involuntary wait time. * @steal: the cpu time spent in involuntary wait @@ -3661,6 +3752,11 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -3686,6 +3782,12 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + account_idle_time(jiffies_to_cputime(ticks)); } -- cgit v1.2.1 From 414bee9ba613adb3804965e2d84db32d0599f9c6 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:04 -0800 Subject: softirqs: Account ksoftirqd time as cpustat softirq softirq time in ksoftirqd context is not accounted in ns granularity per cpu softirq stats, as we want that to be a part of ksoftirqd exec_runtime. Accounting them as softirq on /proc/stat separately. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-6-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2a3c9799d76b..8b718b59b09f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3686,6 +3686,14 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. 
+ */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->softirq); } else if (user_tick) { account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); } else if (p == rq->idle) { -- cgit v1.2.1 From a8941d7ec81678fb69aea7183338175f112f3e0d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Jan 2011 16:30:03 +0100 Subject: sched: Simplify the idle scheduling class Since commit 48c5ccae88dcd (sched: Simplify cpu-hot-unplug task migration) this should no longer happen, so remove the code. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_idletask.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87c..41eb62a0808b 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -52,31 +52,16 @@ static void set_curr_task_idle(struct rq *rq) { } -static void switched_to_idle(struct rq *rq, struct task_struct *p, - int running) +static void +switched_to_idle(struct rq *rq, struct task_struct *p, int running) { - /* Can this actually happen?? */ - if (running) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); + BUG(); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio, int running) { - /* This can happen for hot plug CPUS */ - - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); + BUG(); } static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -- cgit v1.2.1 From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Jan 2011 17:03:27 +0100 Subject: sched: Fix switch_from_fair() When a task is taken out of the fair class we must ensure the vruntime is properly normalized because when we put it back in it will assume to be normalized. The case that goes wrong is when changing away from the fair class while sleeping. Sleeping tasks have non-normalized vruntime in order to make sleeper-fairness work. So treat the switch away from fair as a wakeup and preserve the relative vruntime. Also update sysrq-n to call the ->switch_{to,from} methods. 
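The fix relies on how CFS stores vruntime: while an entity is queued (or sleeping) its vruntime is an absolute value comparable to that cfs_rq's min_vruntime; it is turned into a relative offset when taken off the queue for anything other than sleep, and re-based on the next non-wakeup enqueue. A simplified sketch of that convention, using the names from sched_fair.c (the sketch_* helpers are illustrative; the real checks are on the DEQUEUE_SLEEP/ENQUEUE_WAKEUP flags in dequeue_entity()/enqueue_entity()):

	/* illustrative only; mirrors the normalization done on (de)queue */
	static void sketch_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se,
				   int sleeping)
	{
		/* sleepers keep an absolute vruntime for sleeper fairness */
		if (!sleeping)
			se->vruntime -= cfs_rq->min_vruntime;	/* make relative */
	}

	static void sketch_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se,
				   int waking)
	{
		/* wakeups are placed via place_entity() instead */
		if (!waking)
			se->vruntime += cfs_rq->min_vruntime;	/* make absolute again */
	}

A task that is asleep when it leaves the fair class never took the !sleeping path, so it still carries an absolute vruntime; switched_from_fair() below handles that switch like a wakeup (place_entity() plus an explicit subtraction of min_vruntime) so that a later switch back, which assumes a relative value, does the right thing.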
Reported-by: Onkalo Samu Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++++----------- kernel/sched_fair.c | 42 ++++++++++++++++++++++++++++++++++++------ kernel/sched_idletask.c | 7 +++---- kernel/sched_rt.c | 19 ++++++++++--------- kernel/sched_stoptask.c | 7 +++---- 5 files changed, 66 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b718b59b09f..78fa75394011 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2057,14 +2057,14 @@ inline int task_curr(const struct task_struct *p) static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, - int oldprio, int running) + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); } static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) @@ -2598,6 +2598,7 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; + p->se.vruntime = 0; #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -4696,11 +4697,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, &flags); } @@ -5028,11 +5028,10 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) activate_task(rq, p, 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -8237,6 +8236,8 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ static void normalize_task(struct rq *rq, struct task_struct *p) { + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; int on_rq; on_rq = p->se.on_rq; @@ -8247,6 +8248,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); resched_task(rq->curr); } + + check_class_changed(rq, p, prev_class, old_prio); } void normalize_rt_tasks(void) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4cbc9121094c..55040f3938d8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -4078,33 +4078,62 @@ static void task_fork_fair(struct task_struct *p) * Priority of the task has changed. Check to see if we preempt * the current task. 
*/ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { + if (!p->se.on_rq) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (rq->curr == p) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + /* * We switched to the sched_fair class. */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_fair(struct rq *rq, struct task_struct *p) { + if (!p->se.on_rq) + return; + /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (running) + if (rq->curr == p) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); @@ -4190,6 +4219,7 @@ static const struct sched_class fair_sched_class = { .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 41eb62a0808b..c82f26c1b7c3 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -52,14 +52,13 @@ static void set_curr_task_idle(struct rq *rq) { } -static void -switched_to_idle(struct rq *rq, struct task_struct *p, int running) +static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } -static void prio_changed_idle(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c914ec747ca6..c381fdc18c64 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1595,8 +1595,7 @@ static void rq_offline_rt(struct rq *rq) * When switch from the rt queue, we bring ourselves to a position * that we might want to pull RT tasks from other runqueues. */ -static void switched_from_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_from_rt(struct rq *rq, struct task_struct *p) { /* * If there are other RT tasks then we will reschedule @@ -1605,7 +1604,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, * we may need to handle the pulling of RT tasks * now. */ - if (!rq->rt.rt_nr_running) + if (p->se.on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1624,8 +1623,7 @@ static inline void init_sched_rt_class(void) * with RT tasks. In this case we try to push them off to * other runqueues. 
*/ -static void switched_to_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_rt(struct rq *rq, struct task_struct *p) { int check_resched = 1; @@ -1636,7 +1634,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (!running) { + if (p->se.on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1652,10 +1650,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * Priority of the task has changed. This may cause * us to initiate a push or pull. */ -static void prio_changed_rt(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (running) { + if (!p->se.on_rq) + return; + + if (rq->curr == p) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c1..84ec9bcf82d9 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } -static void prio_changed_stop(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); /* how!?, what priority? */ } -- cgit v1.2.1 From ccaa8d657117bb1876d471bd91579d774106778d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Jan 2011 23:17:32 +0100 Subject: rtmutex-tester: Remove BKL tests The BKL is going away, no need to test it any more. I left the definitions of the test case numbers in, so that the other tests do not get renumbered. 
Signed-off-by: Arnd Bergmann Cc: Arjan van de Ven Cc: Ingo Molnar Cc: Andrew Morton LKML-Reference: <1295993854-4971-19-git-send-email-arnd@arndb.de> Signed-off-by: Thomas Gleixner --- kernel/rtmutex-tester.c | 39 ++++----------------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 66cb89bc5ef1..d5b543506cbc 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -27,7 +26,6 @@ struct test_thread_data { int opcode; int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; - int bkl; int event; struct sys_device sysdev; }; @@ -46,8 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ + RTTEST_LOCKBKL, /* 9 Was: Lock BKL */ + RTTEST_UNLOCKBKL, /* 10 Was: Unlock BKL */ RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ @@ -74,13 +72,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[i] = 0; } } - - if (!lockwakeup && td->bkl == 4) { -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - } return 0; case RTTEST_RESETEVENT: @@ -131,25 +122,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[id] = 0; return 0; - case RTTEST_LOCKBKL: - if (td->bkl) - return 0; - td->bkl = 1; -#ifdef CONFIG_LOCK_KERNEL - lock_kernel(); -#endif - td->bkl = 4; - return 0; - - case RTTEST_UNLOCKBKL: - if (td->bkl != 4) - break; -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - return 0; - default: break; } @@ -196,7 +168,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); break; - case RTTEST_LOCKBKL: default: break; } @@ -229,8 +200,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); return; - case RTTEST_LOCKBKL: - return; default: return; } @@ -380,11 +349,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_lock(&rttest_lock); curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", td->opcode, td->event, tsk->state, (MAX_RT_PRIO - 1) - tsk->prio, (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on, td->bkl); + tsk->pi_blocked_on); for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) curr += sprintf(curr, "%d", td->mutexes[i]); -- cgit v1.2.1 From 10389a15e25fd4784d42de7e0e3fc8c242f2011d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Jan 2011 15:25:56 +0100 Subject: cred: Replace deprecated spinlock initialization SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant instead. 
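For reference, the lockdep-aware initializer takes an argument because lockdep wants a name for the lock class; the general pattern, with made-up names (only __SPIN_LOCK_UNLOCKED() is used by the hunk below):

	#include <linux/spinlock.h>

	struct foo {
		spinlock_t lock;
	};

	/* static object: name the lock for lockdep in the initializer */
	static struct foo bar = {
		.lock = __SPIN_LOCK_UNLOCKED(bar.lock),
	};

	/* standalone static lock: DEFINE_SPINLOCK() does the same initialization */
	static DEFINE_SPINLOCK(bar_list_lock);

	/* dynamically allocated objects are initialized at runtime instead */
	static void foo_init(struct foo *f)
	{
		spin_lock_init(&f->lock);
	}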
Signed-off-by: Thomas Gleixner --- kernel/cred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 6a1aa004e376..b5496e81b0f7 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar; static struct thread_group_cred init_tgcred = { .usage = ATOMIC_INIT(2), .tgid = 0, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), }; #endif -- cgit v1.2.1 From 6ea72f12069306b235151c5b05ac0cca7e1dedfa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Jan 2011 13:36:03 +0100 Subject: sched: Avoid expensive initial update_cfs_load(), on UP too Fix the build on UP. Signed-off-by: Peter Zijlstra Cc: Paul Turner LKML-Reference: <20110122044852.102126037@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 78fa75394011..477e1bcc63f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7922,7 +7922,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; /* allow initial update_cfs_load() to truncate */ +#ifdef CONFIG_SMP cfs_rq->load_stamp = 1; +#endif #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } -- cgit v1.2.1 From 8161239a8bcce9ad6b537c04a1fa3b5c68bae693 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 14 Jan 2011 17:09:41 +0800 Subject: rtmutex: Simplify PI algorithm and make highest prio task get lock In current rtmutex, the pending owner may be boosted by the tasks in the rtmutex's waitlist when the pending owner is deboosted or a task in the waitlist is boosted. This boosting is unrelated, because the pending owner does not really take the rtmutex. It is not reasonable. Example. time1: A(high prio) onwers the rtmutex. B(mid prio) and C (low prio) in the waitlist. time2 A release the lock, B becomes the pending owner A(or other high prio task) continues to run. B's prio is lower than A, so B is just queued at the runqueue. time3 A or other high prio task sleeps, but we have passed some time The B and C's prio are changed in the period (time2 ~ time3) due to boosting or deboosting. Now C has the priority higher than B. ***Is it reasonable that C has to boost B and help B to get the rtmutex? NO!! I think, it is unrelated/unneed boosting before B really owns the rtmutex. We should give C a chance to beat B and win the rtmutex. This is the motivation of this patch. This patch *ensures* only the top waiter or higher priority task can take the lock. How? 1) we don't dequeue the top waiter when unlock, if the top waiter is changed, the old top waiter will fail and go to sleep again. 2) when requiring lock, it will get the lock when the lock is not taken and: there is no waiter OR higher priority than waiters OR it is top waiter. 3) In any time, the top waiter is changed, the top waiter will be woken up. The algorithm is much simpler than before, no pending owner, no boosting for pending owner. Other advantage of this patch: 1) The states of a rtmutex are reduced a half, easier to read the code. 2) the codes become shorter. 3) top waiter is not dequeued until it really take the lock: they will retain FIFO when it is stolen. Not advantage nor disadvantage 1) Even we may wakeup multiple waiters(any time when top waiter changed), we hardly cause "thundering herd", the number of wokenup task is likely 1 or very little. 2) two APIs are changed. 
rt_mutex_owner() will not return pending owner, it will return NULL when the top waiter is going to take the lock. rt_mutex_next_owner() always return the top waiter. will not return NULL if we have waiters because the top waiter is not dequeued. I have fixed the code that use these APIs. need updated after this patch is accepted 1) Document/* 2) the testcase scripts/rt-tester/t4-l2-pi-deboost.tst Signed-off-by: Lai Jiangshan LKML-Reference: <4D3012D5.4060709@cn.fujitsu.com> Reviewed-by: Steven Rostedt Signed-off-by: Steven Rostedt --- kernel/futex.c | 22 ++-- kernel/rtmutex-debug.c | 1 - kernel/rtmutex.c | 318 +++++++++++++++++------------------------------- kernel/rtmutex_common.h | 16 +-- 4 files changed, 127 insertions(+), 230 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..64c38115c7b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1556,10 +1556,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. + * previous highest priority waiter or we are the highest priority + * waiter but failed to get the rtmutex the first time. + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state * because we can fault here. Imagine swapped out pages or a fork @@ -1608,8 +1608,8 @@ retry: /* * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the + * lock here. That gives the other task (either the highest priority + * waiter itself or the task which stole the rtmutex) the * chance to try the fixup of the pi_state. So once we are * back from handling the fault we need to check the pi_state * after reacquiring the hash bucket lock and before trying to @@ -1685,18 +1685,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * pi_state is incorrect, some other task did a lock steal and * we returned due to timeout or signal without taking the - * rt_mutex. Too late. We can access the rt_mutex_owner without - * locking, as the other task is now blocked on the hash bucket - * lock. Fix the state up. + * rt_mutex. Too late. */ + raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); + if (!owner) + owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); + raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } /* * Paranoia check. If we did not take the lock, then we should not be - * the owner, nor the pending owner, of the rt_mutex. + * the owner of the rt_mutex. 
*/ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c8..3c7cbc2c33be 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786a..ab449117aaf2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -20,41 +20,34 @@ /* * lock->owner state tracking: * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 Transitional State* - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** * * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. + * possible when bit 0 of lock->owner is 0. + * + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. * - * (*) There's a small time where the owner can be NULL and the - * "lock has waiters" bit is set. This can happen when grabbing the lock. - * To prevent a cmpxchg of the owner releasing the lock, we need to set this - * bit before looking at the lock, hence the reason this is a transitional - * state. + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. */ static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long mask) +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | mask; + unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; @@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. 
*/ - if (!waiter || !waiter->task) + if (!waiter) goto out_unlock_pi; /* * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock - * and made us the pending owner: + * the previous owner of the lock might have released the lock. */ - if (orig_waiter && !orig_waiter->task) + if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; /* @@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. + */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } put_task_struct(task); /* Grab the next task */ @@ -295,79 +298,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return ret; } -/* - * Optimization: check if we can steal the lock from the - * assigned pending owner [which might not have taken the - * lock yet]: - */ -static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) -{ - struct task_struct *pendowner = rt_mutex_owner(lock); - struct rt_mutex_waiter *next; - unsigned long flags; - - if (!rt_mutex_owner_pending(lock)) - return 0; - - if (pendowner == task) - return 1; - - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 0; - } - - /* - * Check if a waiter is enqueued on the pending owners - * pi_waiters list. Remove it and readjust pending owners - * priority. - */ - if (likely(!rt_mutex_has_waiters(lock))) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 1; - } - - /* No chain handling, pending owner is not blocked on anything: */ - next = rt_mutex_top_waiter(lock); - plist_del(&next->pi_list_entry, &pendowner->pi_waiters); - __rt_mutex_adjust_prio(pendowner); - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - /* - * We are going to steal the lock and a waiter was - * enqueued on the pending owners pi_waiters queue. So - * we have to enqueue this waiter into - * task->pi_waiters list. This covers the case, - * where task is boosted because it holds another - * lock and gets unboosted because the booster is - * interrupted, so we would delay a waiter with higher - * priority as task->normal_prio. - * - * Note: in the rare case of a SCHED_OTHER task changing - * its priority and thus stealing the lock, next->task - * might be task: - */ - if (likely(next->task != task)) { - raw_spin_lock_irqsave(&task->pi_lock, flags); - plist_add(&next->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - return 1; -} - /* * Try to take an rt-mutex * - * This fails - * - when the lock has a real owner - * - when a different pending owner exists and has higher priority than current - * * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. 
(could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { /* * We have to be careful here if the atomic speedups are @@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock)) return 0; + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + /* We got the lock. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, current, 0); + rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, current); + rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!owner) + return 0; + if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); @@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and from - * the lock waiter list. Set it as pending owner. Then wake it up. + * Remove the top waiter from the current tasks waiter list and wake it up. * * Called with lock->wait_lock held. */ static void wakeup_next_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - struct task_struct *pendowner; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); - plist_del(&waiter->list_entry, &lock->wait_list); /* * Remove it from current->pi_waiters. We do not adjust a @@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * lock->wait_lock. */ plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - pendowner = waiter->task; - waiter->task = NULL; - rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + rt_mutex_set_owner(lock, NULL); raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - /* - * Clear the pi_blocked_on variable and enqueue a possible - * waiter into the pi_waiters list of the pending owner. This - * prevents that in case the pending owner gets unboosted a - * waiter with higher priority than pending-owner->normal_prio - * is blocked on the unboosted (pending) owner. 
- */ - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - - WARN_ON(!pendowner->pi_blocked_on); - WARN_ON(pendowner->pi_blocked_on != waiter); - WARN_ON(pendowner->pi_blocked_on->lock != lock); - - pendowner->pi_blocked_on = NULL; - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - wake_up_process(pendowner); + wake_up_process(waiter->task); } /* - * Remove a waiter from a lock + * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) @@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); - waiter->task = NULL; current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (first && owner != current) { + if (!owner) + return; + + if (first) { raw_spin_lock_irqsave(&owner->pi_lock, flags); @@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: passed to task_blocks_on_rt_mutex * * lock->wait_lock must be held by the caller. */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret = 0; for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) + if (try_to_take_rt_mutex(lock, current, waiter)) break; /* @@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - /* - * waiter->task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter->task) { - ret = task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter->task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. 
- */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); - if (waiter->task) - schedule_rt_mutex(lock); + schedule_rt_mutex(lock); raw_spin_lock(&lock->wait_lock); set_current_state(state); @@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { + if (try_to_take_rt_mutex(lock, current, NULL)) { raw_spin_unlock(&lock->wait_lock); return 0; } @@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter.task)) + if (unlikely(ret)) remove_waiter(lock, &waiter); /* @@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - debug_rt_mutex_free_waiter(&waiter); return ret; @@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) if (likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock); + ret = try_to_take_rt_mutex(lock, current, NULL); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. @@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, { __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); } @@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_set_owner(lock, NULL); rt_mutex_deadlock_account_unlock(proxy_owner); } @@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock(&lock->wait_lock); - mark_rt_mutex_waiters(lock); - - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { - /* We got the lock for task. */ - debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); + if (try_to_take_rt_mutex(lock, task, NULL)) { raw_spin_unlock(&lock->wait_lock); - rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { + if (ret && !rt_mutex_owner(lock)) { /* * Reset the return value. 
We might have * returned with -EDEADLK and the owner @@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } + + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter->task)) + if (unlikely(ret)) remove_waiter(lock, waiter); /* @@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - /* - * Readjust priority, when we did not get the lock. We might have been - * the pending owner and boosted. Since we did not take the lock, the - * PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - return ret; } diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81866af..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) /* * lock->owner state tracking: */ -#define RT_MUTEX_OWNER_PENDING 1UL -#define RT_MUTEX_HAS_WAITERS 2UL -#define RT_MUTEX_OWNER_MASKALL 3UL +#define RT_MUTEX_HAS_WAITERS 1UL +#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { @@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); } -static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) -{ - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) -{ - return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; -} - /* * PI-futex support (proxy locking functions, etc.): */ -- cgit v1.2.1 From 1fb0ef31f428f345a7c3666f8e7444a563edd537 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2011 08:57:41 +0100 Subject: genirq: Fix affinity notifier fallout The new code of commit cd7eab44e(genirq: Add IRQ affinity notifiers) references irq_desc.affinity which fails to compile with CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED=y. Use irq_desc.irq_data.affinity instead. Signed-off-by: Thomas Gleixner Cc: Ben Hutchings --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0587c5ceaed8..538fce2db51c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -179,7 +179,7 @@ static void irq_affinity_notify(struct work_struct *work) cpumask_copy(cpumask, desc->pending_mask); else #endif - cpumask_copy(cpumask, desc->affinity); + cpumask_copy(cpumask, desc->irq_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); -- cgit v1.2.1 From 871cf1e5f2a17702f58539a3af8b18fc8666ad4c Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:58:55 +0100 Subject: time: Move do_timer() to kernel/time/timekeeping.c do_timer() is primary timekeeping related. calc_global_load() is called from do_timer() as well, but that's more for historical reasons. 
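
For context, here is a rough sketch (not taken from this patch; the handler shape and the hook names are illustrative and vary per architecture) of how a periodic tick handler has been driving do_timer() so far, which is what makes the xtime_lock/do_timer() coupling visible to arch code:

	static irqreturn_t timer_interrupt(int irq, void *dev_id)
	{
		/* illustrative arch tick handler */
		write_seqlock(&xtime_lock);
		do_timer(1);		/* advance jiffies_64, wall time, load avg */
		write_sequnlock(&xtime_lock);

		update_process_times(user_mode(get_irq_regs()));
		return IRQ_HANDLED;
	}

A later patch in this series wraps the locked section as xtime_update(), so architectures no longer have to take xtime_lock themselves.
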
[ tglx: Fixed up the calc_global_load() reject and massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145855.23248.56933.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 14 +++++++++++++- kernel/timer.c | 13 ------------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d27c7562902c..c1a178ca0f50 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -779,7 +779,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) * * Called from the timer interrupt, must hold a write on xtime_lock. */ -void update_wall_time(void) +static void update_wall_time(void) { struct clocksource *clock; cycle_t offset; @@ -946,3 +946,15 @@ struct timespec get_monotonic_coarse(void) now.tv_nsec + mono.tv_nsec); return now; } + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_wall_time(); + calc_global_load(ticks); +} diff --git a/kernel/timer.c b/kernel/timer.c index 43ca9936f2d0..87f656cc2a55 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1293,19 +1293,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_wall_time(); - calc_global_load(ticks); -} - #ifdef __ARCH_WANT_SYS_ALARM /* -- cgit v1.2.1 From fbad1ea94159a71bc0f68b00e57ae803606af9fb Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:00 +0100 Subject: time: Move get_jiffies_64 to kernel/time/jiffies.c Move the jiffies access functions to the jiffies clocksource code. [ tglx: Add missing include ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145900.23248.73352.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/time.c | 17 ----------------- kernel/time/jiffies.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..a31b51220ac6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -674,23 +674,6 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) -{ - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; -} -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); - /* * Add two timespec values and do a safety check for overflow.
* It's assumed that both values are valid (>= 0) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a8456909..2fbc20744797 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,6 +22,7 @@ ************************************************************************/ #include #include +#include #include /* The Jiffies based clocksource is the lowest common @@ -64,6 +65,23 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + static int __init init_jiffies_clocksource(void) { return clocksource_register(&clocksource_jiffies); -- cgit v1.2.1 From 48cf76f7104f655bbd48a75c7759dce82c3e1ab6 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:05 +0100 Subject: time: Provide get_xtime_and_monotonic_offset() The hrtimer code accesses timekeeping variables under xtime_lock. Provide a sensible accessor function and use it. [ tglx: Removed the conditionals, unused variable, fixed codingstyle and massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145905.23248.30458.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 13 ++----------- kernel/time/timekeeping.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..57c4d33c9a9d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -85,13 +85,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; struct timespec xts, tom; - unsigned long seq; - do { - seq = read_seqbegin(&xtime_lock); - xts = __current_kernel_time(); - tom = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_offset(&xts, &tom); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -612,15 +607,11 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; struct timespec realtime_offset, wtm; - unsigned long seq; if (!hrtimer_hres_active()) return; - do { - seq = read_seqbegin(&xtime_lock); - wtm = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_offset(&realtime_offset, &wtm); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c1a178ca0f50..c50aaf6cd01d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -958,3 +958,19 @@ void do_timer(unsigned long ticks) update_wall_time(); calc_global_load(ticks); } + +/** + * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic + * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + */ +void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom) +{ + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + *xtim = xtime; + *wtom = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); +} -- cgit v1.2.1 From 79ecaf0d15344d78904becf0f25de3fc9b49d430 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2011 11:07:54 +0100 Subject: 
time: Remove unused __get_wall_to_monotonic() No users left. Remove it. Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c50aaf6cd01d..8da35d1b9e16 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -910,11 +910,6 @@ struct timespec __current_kernel_time(void) return xtime; } -struct timespec __get_wall_to_monotonic(void) -{ - return wall_to_monotonic; -} - struct timespec current_kernel_time(void) { struct timespec now; -- cgit v1.2.1 From f0af911a9dec9de702645182c8d269449e24d24b Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:10 +0100 Subject: time: Provide xtime_update() xtime_update() takes xtime_lock write locked and calls do_timer(). Provided to replace the do_timer() calls in the architecture code. Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145910.23248.21379.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8da35d1b9e16..02c13a313d15 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -969,3 +969,16 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom *wtom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); } + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&xtime_lock); + do_timer(ticks); + write_sequnlock(&xtime_lock); +} -- cgit v1.2.1 From e2830b5c1b2b2217894370a3b95af87d4a958401 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:32 +0100 Subject: time: Make do_timer() and xtime_lock local to kernel/time/ All callers of do_timer() are converted to xtime_update(). The only users of xtime_lock are in kernel/time/. Make both local to kernel/time/ and remove them from the global header files. [ tglx: Reuse tick-internal.h instead of creating another local header file. Massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 1 - kernel/time/jiffies.c | 2 ++ kernel/time/ntp.c | 2 ++ kernel/time/tick-broadcast.c | 1 - kernel/time/tick-common.c | 1 - kernel/time/tick-internal.h | 5 +++++ kernel/time/tick-oneshot.c | 1 - kernel/time/tick-sched.c | 1 - 8 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f3..0d74b9ba90c8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 2fbc20744797..b2fa506667c0 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -25,6 +25,8 @@ #include #include +#include "tick-internal.h" + /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on * all systems. 
It has the same coarse resolution as diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5c00242fa921..ed8cfdf16983 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,6 +16,8 @@ #include #include +#include "tick-internal.h" + /* * NTP timekeeping variables: */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..92ef9a54f0a4 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 051bc80a0c43..0e98fac3d479 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..28c578568c9d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,8 @@ /* * tick internal variable and functions used by low/high res code */ +#include +#include #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -132,3 +134,6 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) { return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } + +extern void do_timer(unsigned long ticks); +extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5cbc101f908b..2d04411a5f05 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c55ea2433471..d5097c44b407 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include -- cgit v1.2.1 From 7cf37e87dd2cfa17a64f28ea7f31eed4525f79e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 09:34:58 +0100 Subject: time: Fix legacy arch fallout The xtime/dotimer cleanup broke architectures which do not implement clockevents. Time to send out another __do_IRQ threat. Signed-off-by: Thomas Gleixner Reported-by: Ingo Molnar Cc: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145905.23248.30458.stgit@localhost> Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 28c578568c9d..f77b93df0006 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD + #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -135,5 +137,7 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +#endif + extern void do_timer(unsigned long ticks); extern seqlock_t xtime_lock; -- cgit v1.2.1 From 4916ca401e3051dad326ddd69765bd0e3f32fb9b Mon Sep 17 00:00:00 2001 From: Lucian Adrian Grijincu Date: Tue, 1 Feb 2011 18:44:56 +0200 Subject: security: remove unused security_sysctl hook The only user for this hook was selinux. sysctl routes every call through /proc/sys/. Selinux and other security modules use the file system checks for sysctl too, so no need for this hook any more. 
Signed-off-by: Lucian Adrian Grijincu Signed-off-by: Eric Paris --- kernel/sysctl.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1e3ced..e24254c27eaf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1691,13 +1691,8 @@ static int test_perm(int mode, int op) int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) { - int error; int mode; - error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); - if (error) - return error; - if (root->permissions) mode = root->permissions(root, current->nsproxy, table); else -- cgit v1.2.1 From 1e6d767924c74929c0cfe839ae8f37bcee9e544e Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:50:58 +0000 Subject: time: Correct the *settime* parameters Both settimeofday() and clock_settime() promise with a 'const' attribute not to alter the arguments passed in. This patch adds the missing 'const' attribute into the various kernel functions implementing these calls. Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134417.545698637@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 4 ++-- kernel/time.c | 2 +- kernel/time/timekeeping.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb2bc53..21b7ca205f38 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -192,7 +192,7 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp) } static inline int common_clock_set(const clockid_t which_clock, - struct timespec *tp) + const struct timespec *tp) { return do_sys_settimeofday(tp, NULL); } @@ -928,7 +928,7 @@ void exit_itimers(struct signal_struct *sig) } /* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) +int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) { return -EINVAL; } diff --git a/kernel/time.c b/kernel/time.c index a31b51220ac6..5cb80533d8b5 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -150,7 +150,7 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 02c13a313d15..4f9f65b91323 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; unsigned long flags; -- cgit v1.2.1 From 65da528d7cc94966cf24d2a1e0837b689159b543 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:01 +0000 Subject: posix-timers: Define nanosleep not supported error separate Define the conditional nanosleep not supported error value outside of do_posix_clock_nonanosleep(). Preparatory patch for further cleanups. 
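
With the constant standing alone, callers can return it directly instead of going through a stub; the clock_nanosleep() conversion later in this series does exactly that:

	if (!kc->nsleep)
		return -ENANOSLEEP_NOTSUP;	/* ENOTSUP where it exists (parisc), else EOPNOTSUPP */
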
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.643486574@linutronix.de> --- kernel/posix-timers.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 21b7ca205f38..89bff3766d7d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -81,6 +81,14 @@ static DEFINE_SPINLOCK(idr_lock); #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif /* * The timer ID is turned into a timer address by idr_find(). @@ -937,11 +945,7 @@ EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); int do_posix_clock_nonanosleep(const clockid_t clock, int flags, struct timespec *t, struct timespec __user *r) { -#ifndef ENOTSUP - return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ -#else /* parisc does define it separately. */ - return -ENOTSUP; -#endif + return -ENANOSLEEP_NOTSUP; } EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); -- cgit v1.2.1 From 2fd1f04089cb657c5d6c484b280ec4d3398aa157 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:03 +0000 Subject: posix-timers: Cleanup struct initializers Cosmetic. No functional change Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.745627057@linutronix.de> --- kernel/posix-cpu-timers.c | 24 ++++++++++++------------ kernel/posix-timers.c | 38 +++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb7173850e..11b91dc0992b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1607,20 +1607,20 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block) static __init int init_posix_cpu_timers(void) { struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = thread_cpu_timer_create, + .nsleep = thread_cpu_nsleep, + .nsleep_restart = thread_cpu_nsleep_restart, }; struct timespec ts; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 89bff3766d7d..e7d26afd8ee5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -281,33 +281,33 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = hrtimer_get_res, }; struct k_clock clock_monotonic = 
{ - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .clock_set = do_posix_clock_nosettime, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.2.1 From 1976945eeaab5fa461735a6225a82c3cf1e65d62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:06 +0000 Subject: posix-timers: Introduce clock_posix_cpu The CLOCK_DISPATCH() macro is a horrible magic. We call common functions if a function pointer is not set. That's just backwards. To support dynamic file decriptor based clocks we need to cleanup that dispatch logic. Create a k_clock struct clock_posix_cpu which has all the posix-cpu-timer functions filled in. After the cleanup the functions can be made static. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.841974553@linutronix.de> --- kernel/posix-cpu-timers.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 11b91dc0992b..816cd49a5ad9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1604,6 +1604,18 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block) return -EINVAL; } +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; + static __init int init_posix_cpu_timers(void) { struct k_clock process = { -- cgit v1.2.1 From cc785ac22b17ed53e8ff5c1501e422be6d10be3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:09 +0000 Subject: posix-timers: Introduce clockid_to_kclock() New function to find the kclock for a given clockid. Returns a pointer to clock_posix_cpu if clockid < 0. If clockid >= MAXCLOCK or if the clock_getres pointer is not set it returns NULL. For valid clocks it returns a pointer to the matching posix_clock. 
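
The payoff is that every syscall can share one decoding step followed by a NULL check on the specific method. The shape the conversions below settle on, taken from the clock_settime() conversion later in this series, is roughly:

	struct k_clock *kc = clockid_to_kclock(which_clock);

	if (!kc || !kc->clock_set)	/* bogus clockid, or clock lacks this op */
		return -EINVAL;
	...
	return kc->clock_set(which_clock, &new_tp);
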
Signed-off-by: Thomas Gleixner Cc: John Stultz Acked-by: Richard Cochran LKML-Reference: <20110201134417.938447839@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e7d26afd8ee5..14b0a70ffb1e 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -531,6 +531,16 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) kmem_cache_free(posix_timers_cache, tmr); } +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + /* Create a POSIX.1b interval timer. */ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, -- cgit v1.2.1 From a5cd2880106cb2c79b3fe24f1c53dadba6a542a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:11 +0000 Subject: posix-timers: Convert clock_nanosleep to clockid_to_kclock() Use the new kclock decoding function in clock_nanosleep and cleanup all kclocks which use the default functions. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.034175556@linutronix.de> --- kernel/posix-timers.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 14b0a70ffb1e..ee69b216d5c3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -216,12 +216,6 @@ static int no_timer_create(struct k_itimer *new_timer) return -EOPNOTSUPP; } -static int no_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return -EOPNOTSUPP; -} - /* * Return nonzero if we know a priori this clockid_t value is bogus. 
*/ @@ -282,32 +276,31 @@ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .nsleep = common_nsleep, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, + .nsleep = common_nsleep, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -952,13 +945,6 @@ int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) } EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); -int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t, struct timespec __user *r) -{ - return -ENANOSLEEP_NOTSUP; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { @@ -1023,10 +1009,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec t; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; @@ -1034,8 +1023,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!timespec_valid(&t)) return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep, - (which_clock, flags, &t, rmtp)); + return kc->nsleep(which_clock, flags, &t, rmtp); } /* -- cgit v1.2.1 From 59bd5bc24aa69f6c62da1e242c16f09f667def96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:17 +0000 Subject: posix-timers: Convert clock_nanosleep_restart to clockid_to_kclock() Use the new kclock decoding function in clock_nanosleep_restart. 
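
How the two hooks cooperate, in a simplified sketch based on posix_cpu_nsleep() further below (the field names follow the restart_block cleanup later in this series):

	/* in kc->nsleep(), when the sleep is interrupted: */
	restart_block->fn = posix_cpu_nsleep_restart;
	restart_block->nanosleep.index = which_clock;
	restart_block->nanosleep.rmtp = rmtp;
	restart_block->nanosleep.expires = timespec_to_ns(rqtp);
	return -ERESTART_RESTARTBLOCK;

	/* on syscall restart the stored fn runs; for the compat path that is
	 * clock_nanosleep_restart(), which now re-resolves the kclock and
	 * calls kc->nsleep_restart(restart_block). */
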
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.131263211@linutronix.de> --- kernel/posix-timers.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ee69b216d5c3..4dd86d15bbd0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -277,12 +277,14 @@ static __init int init_posix_timers(void) struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -1026,23 +1028,17 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return kc->nsleep(which_clock, flags, &t, rmtp); } -/* - * nanosleep_restart for monotonic and realtime clocks - */ -static int common_nsleep_restart(struct restart_block *restart_block) -{ - return hrtimer_nanosleep_restart(restart_block); -} - /* * This will restart clock_nanosleep. This is required only by * compat_clock_nanosleep_restart for now. */ -long -clock_nanosleep_restart(struct restart_block *restart_block) +long clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep_restart, - (restart_block)); + return kc->nsleep_restart(restart_block); } -- cgit v1.2.1 From 3751f9f29bcbc19bd10e92254a273486f150c245 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:20 +0000 Subject: posix-timers: Cleanup restart_block usage posix timers still use the legacy arg0-arg3 members of restart_block. Use restart_block.nanosleep instead Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.232288779@linutronix.de> --- kernel/posix-cpu-timers.c | 38 +++++++++++++++----------------------- kernel/posix-timers.c | 2 +- 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 816cd49a5ad9..9e617b00afa9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1485,7 +1485,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + ¤t_thread_info()->restart_block; struct itimerspec it; int error; @@ -1501,50 +1501,42 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (error == -ERESTART_RESTARTBLOCK) { - if (flags & TIMER_ABSTIME) + if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. 
+ */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = rqtp->tv_sec; - restart_block->arg3 = rqtp->tv_nsec; + restart_block->nanosleep.index = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); } return error; } long posix_cpu_nsleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; - struct timespec __user *rmtp; + clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; struct itimerspec it; int error; - rmtp = (struct timespec __user *) restart_block->arg1; - t.tv_sec = restart_block->arg2; - t.tv_nsec = restart_block->arg3; + t = ns_to_timespec(restart_block->nanosleep.expires); - restart_block->fn = do_no_restart_syscall; error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = t.tv_sec; - restart_block->arg3 = t.tv_nsec; + restart_block->nanosleep.expires = timespec_to_ns(&t); } return error; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4dd86d15bbd0..4762986814c8 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1034,7 +1034,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, */ long clock_nanosleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; + clockid_t which_clock = restart_block->nanosleep.index; struct k_clock *kc = clockid_to_kclock(which_clock); if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) -- cgit v1.2.1 From 79c9da0d0539fb341a1b48a2a5a23d974726de90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:45 +0000 Subject: posix-cpu-timers: Remove the stub nanosleep functions CLOCK_THREAD_CPUTIME_ID implements stub functions for nanosleep and nanosleep_restart, which return -EINVAL. That return value is wrong. The correct return value is -ENOTSUP. Remove the stubs and let the new dispatch code return the correct error code. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.422446502@linutronix.de> --- kernel/posix-cpu-timers.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9e617b00afa9..8dc4cd7faf89 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1586,15 +1586,6 @@ static int thread_cpu_timer_create(struct k_itimer *timer) timer->it_clock = THREAD_CLOCK; return posix_cpu_timer_create(timer); } -static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - return -EINVAL; -} -static long thread_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} struct k_clock clock_posix_cpu = { .clock_getres = posix_cpu_clock_getres, @@ -1623,8 +1614,6 @@ static __init int init_posix_cpu_timers(void) .clock_get = thread_cpu_clock_get, .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, }; struct timespec ts; -- cgit v1.2.1 From 26f9a4796af330173d790c8d2b5e2efcc489e755 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:48 +0000 Subject: posix-timers: Convert clock_settime to clockid_to_kclock() Use the new kclock decoding function in clock_settime and cleanup all kclocks which use the default functions. Rename the misnomed common_clock_set() to posix_clock_realtime_set(). Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.518851246@linutronix.de> --- kernel/posix-cpu-timers.c | 2 -- kernel/posix-timers.c | 31 ++++++++++++------------------- 2 files changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8dc4cd7faf89..504fbab10b82 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1604,7 +1604,6 @@ static __init int init_posix_cpu_timers(void) struct k_clock process = { .clock_getres = process_cpu_clock_getres, .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, .nsleep_restart = process_cpu_nsleep_restart, @@ -1612,7 +1611,6 @@ static __init int init_posix_cpu_timers(void) struct k_clock thread = { .clock_getres = thread_cpu_clock_getres, .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, }; struct timespec ts; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4762986814c8..49f358c37708 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -199,12 +199,6 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp) return 0; } -static inline int common_clock_set(const clockid_t which_clock, - const struct timespec *tp) -{ - return do_sys_settimeofday(tp, NULL); -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -232,6 +226,13 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; } +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + /* * Get monotonic time for posix timers */ @@ -276,32 +277,29 @@ static __init int init_posix_timers(void) { 
struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; @@ -940,24 +938,19 @@ void exit_itimers(struct signal_struct *sig) } } -/* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec new_tp; - if (invalid_clockid(which_clock)) + if (!kc || !kc->clock_set) return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, -- cgit v1.2.1 From 42285777631aa0654fbb6442057b3e176445c6c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:50 +0000 Subject: posix-timers: Convert clock_gettime() to clockid_to_kclock() Use the new kclock decoding mechanism and rename the misnomed common_clock_get() to posix_clock_realtime_get(). 
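
For reference, the user-visible interface these kclock methods ultimately serve is plain clock_gettime(2). A minimal user-space example (link with -lrt on older glibc):

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;

		if (clock_gettime(CLOCK_MONOTONIC, &ts)) {	/* dispatched via kc->clock_get */
			perror("clock_gettime");
			return 1;
		}
		printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}
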
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.611097203@linutronix.de> --- kernel/posix-timers.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 49f358c37708..d9e5edfe8a1b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -190,15 +190,6 @@ static inline int common_clock_getres(const clockid_t which_clock, return 0; } -/* - * Get real time for posix timers - */ -static int common_clock_get(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_real_ts(tp); - return 0; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -226,6 +217,13 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; } +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_real_ts(tp); + return 0; +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec *tp) @@ -277,6 +275,7 @@ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -956,18 +955,21 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_get, - (which_clock, &kernel_tp)); + if (!kc->clock_get) + return -EOPNOTSUPP; + + error = kc->clock_get(which_clock, &kernel_tp); + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; return error; - } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, -- cgit v1.2.1 From 4359ac0ace1a2a267927390ad27f781a2f8e0ab8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 11:45:23 +0100 Subject: posix-timers: Make clock_getres and clock_get mandatory Richard said: "I would think that we can require k_clocks to provide the read function. This could be checked and enforced in register_posix_clock()." Add checks for clock_getres and clock_get in the register function. 
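
In other words, the smallest k_clock that register_posix_clock() now accepts must carry both read hooks. A hypothetical sketch (myclk_*, MYCLOCK_ID and the use of ktime_get_ts() are made up for illustration, not part of this patch):

	static int myclk_getres(const clockid_t id, struct timespec *tp)
	{
		tp->tv_sec = 0;
		tp->tv_nsec = 1;	/* pretend 1 ns resolution */
		return 0;
	}

	static int myclk_get(const clockid_t id, struct timespec *tp)
	{
		ktime_get_ts(tp);	/* stand-in: report monotonic time */
		return 0;
	}

	static struct k_clock myclk = {
		.clock_getres	= myclk_getres,
		.clock_get	= myclk_get,
		/* all other hooks may stay NULL; the syscall dispatch
		 * rejects the missing operations with an error code. */
	};

	/* register_posix_clock(MYCLOCK_ID, &myclk); */
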
Suggested-by: Richard Cochran Cc: John Stultz Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d9e5edfe8a1b..7f66143d1ce5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -485,7 +485,18 @@ static struct pid *good_sigevent(sigevent_t * event) void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk("POSIX clock register failed for clock_id %d\n", + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", clock_id); return; } @@ -961,8 +972,6 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, if (!kc) return -EINVAL; - if (!kc->clock_get) - return -EOPNOTSUPP; error = kc->clock_get(which_clock, &kernel_tp); -- cgit v1.2.1 From e5e542eea9075dd008993c2ee80b2cc9f31fc494 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:53 +0000 Subject: posix-timers: Convert clock_getres() to clockid_to_kclock() Use the new kclock decoding. Fixup the fallout in mmtimer.c Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.709802797@linutronix.de> --- kernel/posix-timers.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7f66143d1ce5..748497fffd0f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -182,14 +182,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) * the function pointer CALL in struct k_clock. */ -static inline int common_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = posix_clocks[which_clock].res; - return 0; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -984,18 +976,17 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec rtn_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_getres, - (which_clock, &rtn_tp)); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; - } return error; } -- cgit v1.2.1 From ebaac757acae0431e2c79c00e09f1debdabbddd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:56 +0000 Subject: posix-timers: Remove useless res field from k_clock The res member of kclock is only used by mmtimer.c, but even there it contains redundant information. Remove the field and fixup mmtimer. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.808714587@linutronix.de> --- kernel/posix-timers.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 748497fffd0f..f9142a99b5cb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -204,8 +204,6 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; if (posix_clocks[which_clock].clock_getres != NULL) return 0; - if (posix_clocks[which_clock].res != 0) - return 0; return 1; } -- cgit v1.2.1 From 838394fbf989973ec7f5a0ad82cb6ff09e5c39aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:58 +0000 Subject: posix-timers: Convert timer_create() to clockid_to_kclock() Setup timer_create for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and remove the no_timer_create() implementation. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.903604289@linutronix.de> --- kernel/posix-timers.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index f9142a99b5cb..4f71382a4ca8 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -146,6 +146,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; */ static int common_nsleep(const clockid_t, int flags, struct timespec *t, struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -174,25 +175,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) (posix_clocks[clock].call != NULL \ ? (*posix_clocks[clock].call) arglist : common_##call arglist)) -/* - * Default clock hook functions when the struct k_clock passed - * to register_posix_clock leaves a function pointer null. - * - * The function common_CALL is the default implementation for - * the function pointer CALL in struct k_clock. - */ - -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -static int no_timer_create(struct k_itimer *new_timer) -{ - return -EOPNOTSUPP; -} - /* * Return nonzero if we know a priori this clockid_t value is bogus. 
*/ @@ -269,27 +251,26 @@ static __init int init_posix_timers(void) .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, - .timer_create = no_timer_create, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, - .timer_create = no_timer_create, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, - .timer_create = no_timer_create, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -534,19 +515,28 @@ static struct k_clock *clockid_to_kclock(const clockid_t id) return &posix_clocks[id]; } +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + /* Create a POSIX.1b interval timer. */ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; int it_id_set = IT_ID_NOT_SET; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) @@ -608,7 +598,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + error = kc->timer_create(new_timer); if (error) goto out; @@ -618,7 +608,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, spin_unlock_irq(¤t->sighand->siglock); return 0; - /* + /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify -- cgit v1.2.1 From 27722df16ef143017db55ac7baac1703a68017ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:01 +0000 Subject: posix-timers: Convert timer_settime() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.001863714@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4f71382a4ca8..a4dbfe71c5a5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -252,6 +252,7 @@ static __init int init_posix_timers(void) .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, + .timer_set = common_timer_set, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -259,6 +260,7 @@ static __init int init_posix_timers(void) .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, + .timer_set = common_timer_set, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -814,6 +816,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, int error = 0; unsigned long flag; struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; if (!new_setting) return -EINVAL; @@ -829,8 +832,11 @@ retry: if (!timr) return -EINVAL; - error = CLOCK_DISPATCH(timr->it_clock, timer_set, - (timr, flags, &new_spec, rtn)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { -- cgit v1.2.1 From a7319fa253a549c4c6528fb550ae6e72a9c83811 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:04 +0000 Subject: posix-timers: Convert timer_gettime() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. 
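Continuing the user-space sketch above (same program, reusing its tid and headers), the read-back side now goes through kc->timer_get; this fragment is illustrative, not from the patch:

	struct itimerspec cur;

	if (timer_gettime(tid, &cur) == 0)	/* kc->timer_get (common_timer_get) */
		printf("value %ld.%09lds interval %ld.%09lds\n",
		       (long)cur.it_value.tv_sec, cur.it_value.tv_nsec,
		       (long)cur.it_interval.tv_sec, cur.it_interval.tv_nsec);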
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.101243181@linutronix.de> --- kernel/posix-timers.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a4dbfe71c5a5..c1e2636f9e45 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -253,6 +253,7 @@ static __init int init_posix_timers(void) .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, + .timer_get = common_timer_get, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -261,6 +262,7 @@ static __init int init_posix_timers(void) .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, + .timer_get = common_timer_get, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -712,22 +714,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct k_itimer *timr; struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; unsigned long flags; + int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; - CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); unlock_timer(timr, flags); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; - return 0; + return ret; } /* -- cgit v1.2.1 From 6761c6702e2c647582e1829abe8cf90794f61d9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:07 +0000 Subject: posix-timers: Convert timer_delete() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.198999420@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index c1e2636f9e45..ade7dec49f96 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -254,6 +254,7 @@ static __init int init_posix_timers(void) .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -263,6 +264,7 @@ static __init int init_posix_timers(void) .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -859,7 +861,7 @@ retry: return error; } -static inline int common_timer_del(struct k_itimer *timer) +static int common_timer_del(struct k_itimer *timer) { timer->it.real.interval.tv64 = 0; @@ -870,7 +872,11 @@ static inline int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ -- cgit v1.2.1 From 0aa3975f02ce78f27be3076fbfa3d94ae5a659d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:09 +0000 Subject: posix-timers: Remove CLOCK_DISPATCH leftovers All users gone. Remove the cruft. Huge thanks to Richard Cochran who tackled that maze first. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.294620613@linutronix.de> --- kernel/posix-timers.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ade7dec49f96..ad154dfd7c51 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -167,28 +167,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* - * Call the k_clock hook function if non-null, or the default function. - */ -#define CLOCK_DISPATCH(clock, call, arglist) \ - ((clock) < 0 ? posix_cpu_##call arglist : \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist)) - -/* - * Return nonzero if we know a priori this clockid_t value is bogus. - */ -static inline int invalid_clockid(const clockid_t which_clock) -{ - if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ - return 0; - if ((unsigned) which_clock >= MAX_CLOCKS) - return 1; - if (posix_clocks[which_clock].clock_getres != NULL) - return 0; - return 1; -} - /* Get clock_realtime */ static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) { -- cgit v1.2.1 From bc2c8ea483d73e95fc88f1fc9e7755180f89b892 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:12 +0000 Subject: posix-timers: Make posix-cpu-timers functions static All functions are accessed via clock_posix_cpu now. So make them static. 
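For orientation, a rough reconstruction of the clock_posix_cpu table (it lives in posix-cpu-timers.c but is not visible in these hunks); it is the sole remaining user of the functions made static below, with field names as used for the other kclocks in this series:

struct k_clock clock_posix_cpu = {
	.clock_getres	= posix_cpu_clock_getres,
	.clock_set	= posix_cpu_clock_set,
	.clock_get	= posix_cpu_clock_get,
	.timer_create	= posix_cpu_timer_create,
	.nsleep		= posix_cpu_nsleep,
	.nsleep_restart	= posix_cpu_nsleep_restart,
	.timer_set	= posix_cpu_timer_set,
	.timer_del	= posix_cpu_timer_del,
	.timer_get	= posix_cpu_timer_get,
};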
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.389755466@linutronix.de> --- kernel/posix-cpu-timers.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 504fbab10b82..609e187f43e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) return p->utime; } -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); if (!error) { @@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) return error; } -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, } -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; @@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * This is called from sys_timer_create() and do_cpu_nanosleep() with the * new timer already all-zeros initialized. */ -int posix_cpu_timer_create(struct k_itimer *new_timer) +static int posix_cpu_timer_create(struct k_itimer *new_timer) { int ret = 0; const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); @@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_del(struct k_itimer *timer) +static int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; int ret = 0; @@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) 
*/ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) +static int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; union cpu_time_count old_expires, new_expires, old_incr, val; @@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, return ret; } -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { union cpu_time_count now; struct task_struct *p = timer->it.cpu.task; @@ -1481,8 +1483,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, return error; } -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t_thread_info()->restart_block; @@ -1517,7 +1521,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, return error; } -long posix_cpu_nsleep_restart(struct restart_block *restart_block) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; @@ -1542,7 +1546,6 @@ long posix_cpu_nsleep_restart(struct restart_block *restart_block) } - #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) -- cgit v1.2.1 From 0061748dd2400d0bcd4d49d258db5d7b5d106ca0 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:15 +0000 Subject: posix-timer: Update comment Pick the cleanup to the comment in posix-timers.c from Richards all in one conversion patch. Originally-from: Richard Cochran Signed-off-by: Thomas Gleixner Acked-by: John Stultz LKML-Reference: <20110201134419.487708516@linutronix.de> --- kernel/posix-timers.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad154dfd7c51..a3fdfd4be0ec 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -102,11 +102,7 @@ static DEFINE_SPINLOCK(idr_lock); /* * CLOCKs: The POSIX standard calls for a couple of clocks and allows us * to implement others. This structure defines the various - * clocks and allows the possibility of adding others. We - * provide an interface to add clocks to the table and expect - * the "arch" code to add at least one clock that is high - * resolution. Here we define the standard CLOCK_REALTIME as a - * 1/HZ resolution clock. + * clocks. * * RESOLUTION: Clock resolution is used to round up timer and interval * times, NOT to report clock times, which are reported with as @@ -116,20 +112,13 @@ static DEFINE_SPINLOCK(idr_lock); * necessary code is written. The standard says we should say * something about this issue in the documentation... * - * FUNCTIONS: The CLOCKs structure defines possible functions to handle - * various clock functions. For clocks that use the standard - * system timer code these entries should be NULL. This will - * allow dispatch without the overhead of indirect function - * calls. CLOCKS that depend on other sources (e.g. 
WWV or GPS) - * must supply functions here, even if the function just returns - * ENOSYS. The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_pid - * fields are not modified by timer code. + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. * - * At this time all functions EXCEPT clock_nanosleep can be - * redirected by the CLOCKS structure. Clock_nanosleep is in - * there, but the code ignores it. + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. * * Permissions: It is assumed that the clock_settime() function defined * for each clock will take care of permission checks. Some -- cgit v1.2.1 From c528f7c6c208f1fae6b4025957173dec045e5f21 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 1 Feb 2011 13:52:17 +0000 Subject: time: Introduce timekeeping_inject_offset This adds a kernel-internal timekeeping interface to add or subtract a fixed amount from CLOCK_REALTIME. This makes it so kernel users or interfaces trying to do so do not have to read the time, then add an offset and then call settimeofday(), which adds some extra error in comparision to just simply adding the offset in the kernel timekeeping core. Signed-off-by: John Stultz Signed-off-by: Richard Cochran LKML-Reference: <20110201134419.584311693@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4f9f65b91323..6262c1d18397 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -387,6 +387,42 @@ int do_settimeofday(const struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +int timekeeping_inject_offset(struct timespec *ts) +{ + unsigned long flags; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + timekeeping_forward_now(); + + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + + timekeeper.ntp_error = 0; + ntp_clear(); + + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + /** * change_clocksource - Swaps clocksources if a new one is available * -- cgit v1.2.1 From 094aa1881fdc1b8889b442eb3511b31f3ec2b762 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:20 +0000 Subject: ntp: Add ADJ_SETOFFSET mode bit This patch adds a new mode bit into the timex structure. When set, the bit instructs the kernel to add the given time value to the current time. 
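A user-space sketch (not part of the patch) of how the new mode bit would be used to step CLOCK_REALTIME forward by half a second; it needs CAP_SYS_TIME, and the constants are defined locally in case the libc headers predate them (values match the kernel's timex.h):

#include <stdio.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
# define ADJ_SETOFFSET	0x0100
#endif
#ifndef ADJ_NANO
# define ADJ_NANO	0x2000
#endif

int main(void)
{
	struct timex tx = {
		.modes = ADJ_SETOFFSET | ADJ_NANO,	/* time.tv_usec carries ns */
		.time  = { .tv_sec = 0, .tv_usec = 500000000 },
	};

	if (adjtimex(&tx) < 0) {
		perror("adjtimex(ADJ_SETOFFSET)");
		return 1;
	}
	return 0;
}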
Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134320.688829863@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ed8cfdf16983..5ac593267a26 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -648,6 +648,17 @@ int do_adjtimex(struct timex *txc) hrtimer_cancel(&leap_timer); } + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC) + return -EINVAL; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + timekeeping_inject_offset(&delta); + } + getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); -- cgit v1.2.1 From 65f5d80bdf83ec0d7f3887db10153bf3f36ed73c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:23 +0000 Subject: time: Splitout compat timex accessors Split out the compat timex accessors into separate functions. Preparatory patch for a new syscall. [ tglx: Split that patch from Richards "posix-timers: Introduce a syscall for clock tuning.". Keeps the changes strictly separate ] Originally-from: Richard Cochran Acked-by: John Stultz Signed-off-by: Thomas Gleixner LKML-Reference: <20110201134419.772343089@linutronix.de> --- kernel/compat.c | 113 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0b34a8..449e853cf41d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, put_user(i->tv_usec, &o->tv_usec)) ? 
-EFAULT : 0; } +static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +{ + memset(txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc->modes, &utp->modes) || + __get_user(txc->offset, &utp->offset) || + __get_user(txc->freq, &utp->freq) || + __get_user(txc->maxerror, &utp->maxerror) || + __get_user(txc->esterror, &utp->esterror) || + __get_user(txc->status, &utp->status) || + __get_user(txc->constant, &utp->constant) || + __get_user(txc->precision, &utp->precision) || + __get_user(txc->tolerance, &utp->tolerance) || + __get_user(txc->time.tv_sec, &utp->time.tv_sec) || + __get_user(txc->time.tv_usec, &utp->time.tv_usec) || + __get_user(txc->tick, &utp->tick) || + __get_user(txc->ppsfreq, &utp->ppsfreq) || + __get_user(txc->jitter, &utp->jitter) || + __get_user(txc->shift, &utp->shift) || + __get_user(txc->stabil, &utp->stabil) || + __get_user(txc->jitcnt, &utp->jitcnt) || + __get_user(txc->calcnt, &utp->calcnt) || + __get_user(txc->errcnt, &utp->errcnt) || + __get_user(txc->stbcnt, &utp->stbcnt)) + return -EFAULT; + + return 0; +} + +static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) +{ + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc->modes, &utp->modes) || + __put_user(txc->offset, &utp->offset) || + __put_user(txc->freq, &utp->freq) || + __put_user(txc->maxerror, &utp->maxerror) || + __put_user(txc->esterror, &utp->esterror) || + __put_user(txc->status, &utp->status) || + __put_user(txc->constant, &utp->constant) || + __put_user(txc->precision, &utp->precision) || + __put_user(txc->tolerance, &utp->tolerance) || + __put_user(txc->time.tv_sec, &utp->time.tv_sec) || + __put_user(txc->time.tv_usec, &utp->time.tv_usec) || + __put_user(txc->tick, &utp->tick) || + __put_user(txc->ppsfreq, &utp->ppsfreq) || + __put_user(txc->jitter, &utp->jitter) || + __put_user(txc->shift, &utp->shift) || + __put_user(txc->stabil, &utp->stabil) || + __put_user(txc->jitcnt, &utp->jitcnt) || + __put_user(txc->calcnt, &utp->calcnt) || + __put_user(txc->errcnt, &utp->errcnt) || + __put_user(txc->stbcnt, &utp->stbcnt) || + __put_user(txc->tai, &utp->tai)) + return -EFAULT; + return 0; +} + asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { @@ -951,58 +1009,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; - int ret; + int err, ret; - memset(&txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - __get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, 
&utp->stbcnt)) - return -EFAULT; + err = compat_get_timex(&txc, utp); + if (err) + return err; ret = do_adjtimex(&txc); - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, &utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - __put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt) || - __put_user(txc.tai, &utp->tai)) - ret = -EFAULT; + err = compat_put_timex(utp, &txc); + if (err) + return err; return ret; } -- cgit v1.2.1 From f1f1d5ebd10ffa4242bce7a90a56a222d6b7bc77 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:26 +0000 Subject: posix-timers: Introduce a syscall for clock tuning. A new syscall is introduced that allows tuning of a POSIX clock. The new call, clock_adjtime, takes two parameters, the clock ID and a pointer to a struct timex. Any ADJTIMEX(2) operation may be requested via this system call, but various POSIX clocks may or may not support tuning. [ tglx: Adapted to the posix-timer cleanup series. Avoid copy_to_user in the error case ] Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134419.869804645@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/compat.c | 23 +++++++++++++++++++++++ kernel/posix-timers.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 449e853cf41d..38b1d2c1cbe8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -675,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } +long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *utp) +{ + struct timex txc; + mm_segment_t oldfs; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); + set_fs(oldfs); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} + long compat_sys_clock_getres(clockid_t which_clock, struct compat_timespec __user *tp) { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a3fdfd4be0ec..5a5a4f1c0971 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -170,6 +170,12 @@ static int posix_clock_realtime_set(const clockid_t which_clock, return do_sys_settimeofday(tp, NULL); } +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) +{ + return do_adjtimex(t); +} + /* * Get monotonic time for posix timers */ @@ -216,6 +222,7 @@ static __init int init_posix_timers(void) .clock_getres = hrtimer_get_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = 
common_timer_create, @@ -948,6 +955,29 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return error; } +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { -- cgit v1.2.1 From 81e294cba2596f5f10848bbe19d98b344c2a2d5c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:32 +0000 Subject: posix-timers: Add support for fd based clocks Extend the negative clockids which are currently used by posix cpu timers to encode the PID with a file descriptor based type which encodes the fd in the upper bits. Originally-from: Richard Cochran Signed-off-by: Thomas Gleixner Acked-by: John Stultz LKML-Reference: <20110201134420.062860200@linutronix.de> --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5a5a4f1c0971..df629d853a81 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -488,7 +488,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) static struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) - return &clock_posix_cpu; + return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu; if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) return NULL; -- cgit v1.2.1 From 527087374faa488776a789375a7d6ea74fda6f71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 12:10:09 +0100 Subject: posix-timers: Cleanup namespace Rename register_posix_clock() to posix_timers_register_clock(). That's what the function really does. As a side effect this cleans up the posix_clock namespace for the upcoming dynamic posix_clock infrastructure. 
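A hypothetical caller (not from the patch), showing the renamed registration helper in use; CLOCK_EXAMPLE_ID and the callbacks are made up for illustration, and a real clock id would have to be one of the reserved slots below MAX_CLOCKS:

#include <linux/init.h>
#include <linux/time.h>
#include <linux/posix-timers.h>

#define CLOCK_EXAMPLE_ID 15	/* placeholder, must be < MAX_CLOCKS */

static int example_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
	tp->tv_sec  = 0;
	tp->tv_nsec = NSEC_PER_SEC / HZ;	/* pretend tick resolution */
	return 0;
}

static int example_clock_get(const clockid_t which_clock, struct timespec *tp)
{
	/* placeholder: a real clock would read its hardware here */
	tp->tv_sec  = 0;
	tp->tv_nsec = 0;
	return 0;
}

static struct k_clock clock_example = {
	.clock_getres	= example_clock_getres,
	.clock_get	= example_clock_get,
};

static __init int example_clock_init(void)
{
	posix_timers_register_clock(CLOCK_EXAMPLE_ID, &clock_example);
	return 0;
}
device_initcall(example_clock_init);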
Signed-off-by: Thomas Gleixner Tested-by: Richard Cochran Cc: John Stultz LKML-Reference: --- kernel/posix-cpu-timers.c | 4 ++-- kernel/posix-timers.c | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 609e187f43e7..67fea9d25d55 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1618,8 +1618,8 @@ static __init int init_posix_cpu_timers(void) }; struct timespec ts; - register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index df629d853a81..af936fd37140 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -253,11 +253,11 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; - register_posix_clock(CLOCK_REALTIME, &clock_realtime); - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -433,7 +433,8 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", @@ -454,7 +455,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) posix_clocks[clock_id] = *new_clock; } -EXPORT_SYMBOL_GPL(register_posix_clock); +EXPORT_SYMBOL_GPL(posix_timers_register_clock); static struct k_itimer * alloc_posix_timer(void) { -- cgit v1.2.1 From 0606f422b453f76c31ab2b1bd52943ff06a2dcf2 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:35 +0000 Subject: posix clocks: Introduce dynamic clocks This patch adds support for adding and removing posix clocks. The clock lifetime cycle is patterned after usb devices. Each clock is represented by a standard character device. In addition, the driver may optionally implement custom character device operations. The posix clock and timer system calls listed below now work with dynamic posix clocks, as well as the traditional static clocks. The following system calls are affected: - clock_adjtime (brand new syscall) - clock_gettime - clock_getres - clock_settime - timer_create - timer_delete - timer_gettime - timer_settime [ tglx: Adapted to the posix-timer cleanup. 
Moved clock_posix_dynamic to posix-clock.c and made all referenced functions static ] Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134420.164172635@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 4 +- kernel/time/Makefile | 3 +- kernel/time/posix-clock.c | 441 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 446 insertions(+), 2 deletions(-) create mode 100644 kernel/time/posix-clock.c (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index af936fd37140..44fcff131b38 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -489,7 +490,8 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) static struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu; + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) return NULL; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..b0425991e9ac 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,5 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timeconv.o posix-clock.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000000..04498cbf6002 --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,441 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include +#include +#include +#include +#include + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. 
+ */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + mutex_lock(&clk->mutex); + + if (!clk->zombie) + return clk; + + mutex_unlock(&clk->mutex); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + mutex_unlock(&clk->mutex); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + mutex_lock(&clk->mutex); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + mutex_unlock(&clk->mutex); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + mutex_init(&clk->mutex); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + 
if (err) + goto no_cdev; + + return err; +no_cdev: + mutex_destroy(&clk->mutex); + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + mutex_destroy(&clk->mutex); + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + mutex_lock(&clk->mutex); + clk->zombie = true; + mutex_unlock(&clk->mutex); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + 
clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; -- cgit v1.2.1 From fe4b04fa31a6dcf4358aa84cf81e5a7fd079469b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Feb 2011 13:19:09 +0100 Subject: perf: Cure task_oncpu_function_call() races Oleg reported that on architectures with __ARCH_WANT_INTERRUPTS_ON_CTXSW the IPI from task_oncpu_function_call() can land before perf_event_task_sched_in() and cause interesting situations for eg. perf_install_in_context(). This patch reworks the task_oncpu_function_call() interface to give a more usable primitive as well as rework all its users to hopefully be more obvious as well as remove the races. While looking at the code I also found a number of races against perf_event_task_sched_out() which can flip contexts between tasks so plug those too. Reported-and-reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 260 +++++++++++++++++++++++++++++++++------------------- kernel/sched.c | 29 +----- 2 files changed, 172 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 126a302c481c..7d3faa25e136 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,79 @@ #include +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. 
+ * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -254,7 +327,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); } /* @@ -618,35 +690,24 @@ __get_cpu_context(struct perf_event_context *ctx) * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static void __perf_event_remove_from_context(void *info) +static int __perf_remove_from_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); + + return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * Must be called with ctx->mutex held. - * * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * @@ -657,49 +718,48 @@ static void __perf_event_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_event_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + if (!task) { /* * Per cpu events are removed via an smp call and * the removal is always successful. */ - smp_call_function_single(event->cpu, - __perf_event_remove_from_context, - event, 1); + cpu_function_call(event->cpu, __perf_remove_from_context, event); return; } retry: - task_oncpu_function_call(task, __perf_event_remove_from_context, - event); + if (!task_function_call(task, __perf_remove_from_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * If the context is active we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->nr_active && !list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can remove the event safely, if the call above did not - * succeed. + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. 
*/ - if (!list_empty(&event->group_entry)) - list_del_event(event, ctx); + list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to disable a performance event */ -static void __perf_event_disable(void *info) +static int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -708,9 +768,12 @@ static void __perf_event_disable(void *info) /* * If this is a per-task event, need to check whether this * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. */ if (ctx->task && cpuctx->task_ctx != ctx) - return; + return -EINVAL; raw_spin_lock(&ctx->lock); @@ -729,6 +792,8 @@ static void __perf_event_disable(void *info) } raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -753,13 +818,13 @@ void perf_event_disable(struct perf_event *event) /* * Disable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_disable, - event, 1); + cpu_function_call(event->cpu, __perf_event_disable, event); return; } retry: - task_oncpu_function_call(task, __perf_event_disable, event); + if (!task_function_call(task, __perf_event_disable, event)) + return; raw_spin_lock_irq(&ctx->lock); /* @@ -767,6 +832,11 @@ retry: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -778,7 +848,6 @@ retry: update_group_times(event); event->state = PERF_EVENT_STATE_OFF; } - raw_spin_unlock_irq(&ctx->lock); } @@ -928,12 +997,14 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } +static void perf_event_context_sched_in(struct perf_event_context *ctx); + /* * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ -static void __perf_install_in_context(void *info) +static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -942,17 +1013,12 @@ static void __perf_install_in_context(void *info) int err; /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no events. + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx); raw_spin_lock(&ctx->lock); ctx->is_active = 1; @@ -997,6 +1063,8 @@ static void __perf_install_in_context(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1008,8 +1076,6 @@ unlock: * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. 
*/ static void perf_install_in_context(struct perf_event_context *ctx, @@ -1018,6 +1084,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + event->ctx = ctx; if (!task) { @@ -1025,31 +1093,29 @@ perf_install_in_context(struct perf_event_context *ctx, * Per cpu events are installed via an smp call and * the install is always successful. */ - smp_call_function_single(cpu, __perf_install_in_context, - event, 1); + cpu_function_call(cpu, __perf_install_in_context, event); return; } retry: - task_oncpu_function_call(task, __perf_install_in_context, - event); + if (!task_function_call(task, __perf_install_in_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->is_active && list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can add the event safely, if it the call above did not - * succeed. + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. */ - if (list_empty(&event->group_entry)) - add_event_to_ctx(event, ctx); + add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1078,7 +1144,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, /* * Cross CPU call to enable a performance event */ -static void __perf_event_enable(void *info) +static int __perf_event_enable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -1086,18 +1152,10 @@ static void __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) @@ -1138,6 +1196,8 @@ static void __perf_event_enable(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1158,8 +1218,7 @@ void perf_event_enable(struct perf_event *event) /* * Enable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_enable, - event, 1); + cpu_function_call(event->cpu, __perf_event_enable, event); return; } @@ -1178,8 +1237,15 @@ void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + raw_spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_event_enable, event); + + if (!task_function_call(task, __perf_event_enable, event)) + return; raw_spin_lock_irq(&ctx->lock); @@ -1187,15 +1253,14 @@ retry: * If the context is active and the event is still off, * we need to retry the cross-call. 
*/ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_OFF) - __perf_event_mark_enabled(event, ctx); + } out: raw_spin_unlock_irq(&ctx->lock); @@ -1339,8 +1404,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; @@ -1533,7 +1598,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, { struct perf_cpu_context *cpuctx; - cpuctx = __get_cpu_context(ctx); + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; @@ -1541,7 +1606,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, cpuctx->task_ctx = ctx; } -void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1627,7 +1692,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ -#define REDUCE_FLS(a, b) \ +#define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ @@ -2213,6 +2278,9 @@ errout: } +/* + * Returns a matching context with refcount and pincount. 
+ */ static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { @@ -2237,6 +2305,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + ++ctx->pin_count; return ctx; } @@ -2250,6 +2319,7 @@ retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); + ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2271,8 +2341,10 @@ retry: err = -ESRCH; else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; - else + else { + ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { @@ -5950,10 +6022,10 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_event_remove_from_context(group_leader); + perf_remove_from_context(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_event_remove_from_context(sibling); + perf_remove_from_context(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); @@ -5976,6 +6048,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); event->owner = current; @@ -6001,6 +6074,7 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: + perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); @@ -6051,6 +6125,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; @@ -6104,7 +6179,7 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - perf_event_remove_from_context(child_event); + perf_remove_from_context(child_event); parent_event = child_event->parent; /* @@ -6411,7 +6486,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -6526,6 +6601,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); return ret; } @@ -6595,9 +6671,9 @@ static void __perf_event_exit_context(void *__info) perf_pmu_rotate_stop(ctx->pmu); list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); } static void perf_event_exit_cpu_context(int cpu) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..31cb5d5e1aac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2265,27 +2265,6 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); #endif /* CONFIG_SMP */ -/** - * task_oncpu_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. 
This might - * be on the current CPU, which just calls the function directly - */ -void task_oncpu_function_call(struct task_struct *p, - void (*func) (void *info), void *info) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if (task_curr(p)) - smp_call_function_single(cpu, func, info, 1); - preempt_enable(); -} - #ifdef CONFIG_SMP /* * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. @@ -2776,9 +2755,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + trace_sched_switch(prev, next); } /** @@ -2911,7 +2893,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(prev, next); + mm = next->mm; oldmm = prev->active_mm; /* @@ -3989,9 +3971,6 @@ need_resched_nonpreemptible: rq->skip_clock_update = 0; if (likely(prev != next)) { - sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next); - rq->nr_switches++; rq->curr = next; ++*switch_count; -- cgit v1.2.1 From 725e7580aaf98e9f7b22b8ccfc640ad0c09e2b08 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:47:15 -0500 Subject: sched: Check the right ->nr_running in yield_task_fair() With CONFIG_FAIR_GROUP_SCHED, each task_group has its own cfs_rq. Yielding to a task from another cfs_rq may be worthwhile, since a process calling yield typically cannot use the CPU right now. Therefor, we want to check the per-cpu nr_running, not the cgroup local one. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094715.798c4f86@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 55040f3938d8..4de9905228c4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ - if (unlikely(cfs_rq->nr_running == 1)) + if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); -- cgit v1.2.1 From 2c13c919d9e9a3db9896143a501f83dcbbe1ced4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:48:37 -0500 Subject: sched: Limit the scope of clear_buddies The clear_buddies function does not seem to play well with the concept of hierarchical runqueues. In the following tree, task groups are represented by 'G', tasks by 'T', next by 'n' and last by 'l'. (nl) / \ G(nl) G / \ \ T(l) T(n) T This situation can arise when a task is woken up T(n), and the previously running task T(l) is marked last. When clear_buddies is called from either T(l) or T(n), the next and last buddies of the group G(nl) will be cleared. This is not the desired result, since we would like to be able to find the other type of buddy in many cases. This especially a worry when implementing yield_task_fair through the buddy system. The fix is simple: only clear the buddy type that the task itself is indicated to be. As an added bonus, we stop walking up the tree when the buddy has already been cleared or pointed elsewhere. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094837.6b0962a9@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4de9905228c4..a785e08202cf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -995,19 +995,35 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) list_add_leaf_cfs_rq(cfs_rq); } -static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies_last(struct sched_entity *se) { - if (!se || cfs_rq->last == se) - cfs_rq->last = NULL; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} - if (!se || cfs_rq->next == se) - cfs_rq->next = NULL; +static void __clear_buddies_next(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) - __clear_buddies(cfs_rq_of(se), se); + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); } static void -- cgit v1.2.1 From ac53db596cc08ecb8040cfb6f71ae40c6f2041c4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:51:03 -0500 Subject: sched: Use a buddy to implement yield_task_fair() Use the buddy mechanism to implement yield_task_fair. This allows us to skip onto the next highest priority se at every level in the CFS tree, unless doing so would introduce gross unfairness in CPU time distribution. We order the buddy selection in pick_next_entity to check yield first, then last, then next. We need next to be able to override yield, because it is possible for the "next" and "yield" task to be different processen in the same sub-tree of the CFS tree. When they are, we need to go into that sub-tree regardless of the "yield" hint, and pick the correct entity once we get to the right level. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 148 ++++++++++++++++++++++++++++++--------------------- kernel/sysctl.c | 7 --- 4 files changed, 90 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 477e1bcc63f9..ae5e1a19b9d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -324,7 +324,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). 
*/ - struct sched_entity *curr, *next, *last; + struct sched_entity *curr, *next, *last, *skip; unsigned int nr_spread_over; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index eb6cb8edd075..7bacd83a4158 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a785e08202cf..c0fbeb992833 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -68,14 +68,6 @@ static unsigned int sched_nr_latency = 8; */ unsigned int sysctl_sched_child_runs_first __read_mostly; -/* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - /* * SCHED_OTHER wake-up granularity. * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) @@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -#ifdef CONFIG_SCHED_DEBUG int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1017,6 +1019,17 @@ static void __clear_buddies_next(struct sched_entity *se) } } +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } +} + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) @@ -1024,6 +1037,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) if (cfs_rq->next == se) __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); } static void @@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta < 0) @@ -1143,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the 
"next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } /* * Prefer last buddy, try to return the CPU to a preempted task. @@ -1157,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + clear_buddies(cfs_rq, se); return se; @@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || entity_before(rightmost, se))) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - #ifdef CONFIG_SMP static void task_waking_fair(struct rq *rq, struct task_struct *p) @@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) } } +static void set_skip_buddy(struct sched_entity *se) +{ + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. 
+ */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bc86bb32e126..cbfda7e2416e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -360,13 +360,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, - { - .procname = "sched_compat_yield", - .data = &sysctl_sched_compat_yield, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", -- cgit v1.2.1 From d95f412200652694e63e64bfd49f0ae274a54479 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 1 Feb 2011 09:50:51 -0500 Subject: sched: Add yield_to(task, preempt) functionality Currently only implemented for fair class tasks. Add a yield_to_task method() to the fair scheduling class. allowing the caller of yield_to() to accelerate another thread in it's thread group, task group. Implemented via a scheduler hint, using cfs_rq->next to encourage the target being selected. We can rely on pick_next_entity to keep things fair, so noone can accelerate a thread that has already used its fair share of CPU time. This also means callers should only call yield_to when they really mean it. Calling it too often can result in the scheduler just ignoring the hint. Signed-off-by: Rik van Riel Signed-off-by: Marcelo Tosatti Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095051.4ddb7738@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 20 +++++++++++++ 2 files changed, 105 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ae5e1a19b9d6..2effcb71a478 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1686,6 +1686,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + #endif static void calc_load_account_idle(struct rq *this_rq); @@ -5448,6 +5481,58 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. 
+ */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + bool yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out; + + if (curr->sched_class != p->sched_class) + goto out; + + if (task_running(p_rq, p) || p->state) + goto out; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) + schedstat_inc(rq, yld_count); + +out: + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c0fbeb992833..027024694043 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1975,6 +1975,25 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ + if (preempt) + resched_task(rq->curr); + + yield_task_fair(rq); + + return true; +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: @@ -4243,6 +4262,7 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, -- cgit v1.2.1 From 76022db323dd6d7c6958df3d595f7dedf7a14778 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:53 +0900 Subject: tracing/kprobes: Cleanup strict_strtol() using code Since strict_strtol() accepts minus digits started with '-', it doesn't need to invert after converting. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125153.9507.49335.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..2088893c049e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -767,16 +767,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, } break; case '+': /* deref memory */ + arg++; /* Skip '+', because strict_strtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg + 1, 0, &offset); + ret = strict_strtol(arg, 0, &offset); if (ret) break; - if (arg[0] == '-') - offset = -offset; arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { -- cgit v1.2.1 From e3745369986ddcdaa19f70e2d24e658876b97e84 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:59 +0900 Subject: tracing/kprobes: Support longer (>128 bytes) command Expand command line buffer of kprobe-tracer to 4096 bytes. 
Reported-by: Arnaldo Carvalho de Melo Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125159.9507.20895.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2088893c049e..c6ed88660856 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1129,7 +1129,7 @@ static int command_trace_probe(const char *buf) return ret; } -#define WRITE_BUFSIZE 128 +#define WRITE_BUFSIZE 4096 static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) -- cgit v1.2.1 From 1ff511e35ed87cc2ebade9e678e4a2fe39b6f9c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:52:05 +0900 Subject: tracing/kprobes: Add bitfield type Add bitfield type for tracing arguments on kprobe-tracer. The syntax of a bitfield type is: b@/ e.g. Accessing 2 bits-width field with 4 bits-offset in 32 bits-width data at 4 bytes offseted from the address pointed by AX register: +4(%ax):b2@4/32 Since the width of container data depends on the arch, so I just added the container-size at the end. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125205.9507.11363.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 104 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6ed88660856..ccdc542022c3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) kfree(data); } +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static __kprobes void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. 
+ */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -367,6 +404,7 @@ enum { FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, + FETCH_MTD_bitfield, FETCH_MTD_END, }; @@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ } \ } @@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) if (!type) type = DEFAULT_FETCH_TYPE_STR; + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + type = strchr(type, '/'); + if (!type) + goto fail; + type++; + if (strict_strtoul(type, 0, &bs)) + goto fail; + switch (bs) { + case 8: + return find_fetch_type("u8"); + case 16: + return find_fetch_type("u16"); + case 32: + return find_fetch_type("u32"); + case 64: + return find_fetch_type("u64"); + default: + goto fail; + } + } + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) if (strcmp(type, fetch_type_table[i].name) == 0) return &fetch_type_table[i]; +fail: return NULL; } @@ -586,7 +649,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); @@ -806,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, return ret; } +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? 
-EINVAL : 0; +} + /* String length checking wrapper */ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) @@ -835,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, parg->fetch.fn); -- cgit v1.2.1 From 6d54057d76e25c91165cda0e6e007f1811faa2be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:33:26 -0500 Subject: tracing/filter: Have no filter return a match The n_preds field of a file can change at anytime, and even can become zero, just as the filter is about to be processed by an event. In the case that is zero on entering the filter, return 1, telling the caller the event matchs and should be trace. Also use a variable and assign it with ACCESS_ONCE() such that the count stays consistent within the function. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..7275f0310ed8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -383,9 +383,14 @@ int filter_match_preds(struct event_filter *filter, void *rec) int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int n_preds = ACCESS_ONCE(filter->n_preds); int i; - for (i = 0; i < filter->n_preds; i++) { + /* no filter is considered a match */ + if (!n_preds) + return 1; + + for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec, val1, val2); -- cgit v1.2.1 From 58d9a597c4275d830a819625e7d437cd6fb23fa5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:37:09 -0500 Subject: tracing/filter: Move OR and AND logic out of fn() method The ops OR and AND act different from the other ops, as they are the only ones to take other ops as their arguements. These ops als change the logic of the filter_match_preds. By removing the OR and AND fn's we can also remove the val1 and val2 that is passed to all other fn's and are unused. 
Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +-- kernel/trace/trace_events_filter.c | 51 ++++++++++++++------------------------ 2 files changed, 20 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..1597bc0749c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,8 +677,7 @@ struct event_subsystem { struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7275f0310ed8..5d719b340a2b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -124,8 +124,7 @@ struct filter_parse_state { }; #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +151,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +176,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +190,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +213,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. 
*/ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +228,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -380,7 +360,7 @@ static void filter_build_regex(struct filter_pred *pred) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; + int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); @@ -393,7 +373,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); + match = pred->fn(pred, rec); stack[top++] = match; continue; } @@ -403,7 +383,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) } val1 = stack[--top]; val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); + switch (pred->op) { + case OP_AND: + match = val1 && val2; + break; + case OP_OR: + match = val1 || val2; + break; + default: + WARN_ONCE(1, "filter op is not AND or OR"); + } stack[top++] = match; } @@ -775,15 +764,13 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; if (pred->op == OP_AND) { pred->pop_n = 2; - fn = filter_pred_and; goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - fn = filter_pred_or; goto add_pred_fn; } -- cgit v1.2.1 From c9c53ca03d6f97fdd9832d5ed3f15b30ee5cdb86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:42:43 -0500 Subject: tracing/filter: Dynamically allocate preds For every filter that is made, we create predicates to hold every operation within the filter. We have a max of 32 predicates that we can hold. Currently, we allocate all 32 even if we only need to use one. Part of the reason we do this is that the filter can be used at any moment by any event. Fortunately, the filter is only used with preemption disabled. By reseting the count of preds used "n_preds" to zero, then performing a synchronize_sched(), we can safely free and reallocate a new array of preds. 
Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 143 ++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1597bc0749c1..441fc1bc85d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -661,7 +661,8 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ struct filter_pred **preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5d719b340a2b..aac6a6183e6a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,6 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; + struct filter_pred **preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -370,8 +371,13 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!n_preds) return 1; + /* + * n_preds and filter->preds is protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; + pred = preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -548,46 +554,55 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } +static void __free_preds(struct event_filter *filter) +{ + int i; + + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; + filter->n_preds = 0; +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + filter->preds[i]->fn = filter_pred_none; + } filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void __free_filter(struct event_filter *filter) { - int i; - if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; - struct filter_pred *pred; - int i; filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter) @@ -595,32 +610,63 @@ static struct event_filter *__alloc_preds(void) filter->n_preds = 0; - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ + struct filter_pred *pred; + int i; + + if (filter->preds) { + if (filter->a_preds < n_preds) { + /* We need to reallocate */ + filter->n_preds = 0; + /* + * It is possible that the filter is currently + * being used. We need to zero out the number + * of preds, wait on preemption and then free + * the preds. 
+ */ + synchronize_sched(); + __free_preds(filter); + } + } + + if (!filter->preds) { + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->a_preds = n_preds; + } if (!filter->preds) - goto oom; + return -ENOMEM; + + if (WARN_ON(filter->a_preds < n_preds)) + return -EINVAL; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + if (!pred) + pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; filter->preds[i] = pred; } - return filter; - -oom: + return 0; + oom: __free_preds(filter); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } -static int init_preds(struct ftrace_event_call *call) +static int init_filter(struct ftrace_event_call *call) { if (call->filter) return 0; call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); + call->filter = __alloc_filter(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -636,7 +682,7 @@ static int init_subsystem_preds(struct event_subsystem *system) if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); + err = init_filter(call); if (err) return err; } @@ -665,7 +711,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1179,6 +1225,20 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1191,10 +1251,23 @@ static int replace_preds(struct ftrace_event_call *call, int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_preds(filter, n_preds); + if (err) + return err; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1208,7 +1281,7 @@ static int replace_preds(struct ftrace_event_call *call, continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1283,7 +1356,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); - err = init_preds(call); + err = init_filter(call); if (err) goto out_unlock; @@ -1376,7 +1449,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1402,7 +1475,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); + filter = __alloc_filter(); if (IS_ERR(filter)) { err = PTR_ERR(filter); goto out_unlock; @@ -1411,7 +1484,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto 
free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1427,9 +1500,9 @@ free_ps: postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); -- cgit v1.2.1 From 0fc3ca9a10a61a77f18710fb708b41fd99c79a56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:46:46 -0500 Subject: tracing/filter: Call synchronize_sched() just once for system filters By separating out the reseting of the filter->n_preds to zero from the reallocation of preds for the filter, we can reset groups of filters first, call synchronize_sched() just once, and then reallocate each of the filters in the system group. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 80 ++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index aac6a6183e6a..8f00a11ce778 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -570,17 +570,28 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } +static void reset_preds(struct event_filter *filter) +{ + struct filter_pred *pred; + int n_preds = filter->n_preds; + int i; + + filter->n_preds = 0; + if (!filter->preds) + return; + + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + pred->fn = filter_pred_none; + } +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; - int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - filter->preds[i]->fn = filter_pred_none; - } - filter->n_preds = 0; + reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -620,15 +631,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) { if (filter->a_preds < n_preds) { - /* We need to reallocate */ - filter->n_preds = 0; /* - * It is possible that the filter is currently - * being used. We need to zero out the number - * of preds, wait on preemption and then free - * the preds. + * We need to reallocate. + * We should have already have zeroed out + * the pred count and called synchronized_sched() + * to make sure no one is using the preds. */ - synchronize_sched(); + if (WARN_ON_ONCE(filter->n_preds)) { + /* We need to reset it now */ + filter->n_preds = 0; + synchronize_sched(); + } __free_preds(filter); } } @@ -1328,6 +1341,30 @@ static int replace_system_preds(struct event_subsystem *system, /* try to see if the filter can be applied */ err = replace_preds(call, filter, ps, filter_string, true); if (err) + goto fail; + } + + /* set all filter pred counts to zero */ + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) + continue; + + reset_preds(filter); + } + + /* + * Since some of the preds may be used under preemption + * we need to wait for them to finish before we may + * reallocate them. 
+ */ + synchronize_sched(); + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) continue; /* really apply the filter */ @@ -1342,11 +1379,13 @@ static int replace_system_preds(struct event_subsystem *system, fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; - } + if (fail) + goto fail; + return 0; + fail: + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) @@ -1381,6 +1420,13 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } + /* + * Make sure all the pred counts are zero so that + * no task is using it when we reallocate the preds array. + */ + reset_preds(call->filter); + synchronize_sched(); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); -- cgit v1.2.1 From 74e9e58c350a24139e268dd6857bbaa55c5aafcf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:49:48 -0500 Subject: tracing/filter: Allocate the preds in an array Currently we allocate an array of pointers to filter_preds, and then allocate a separate filter_pred for each item in the array. This adds slight overhead in the filters as it needs to derefernce twice to get to the op condition. Allocating the preds themselves in a single array removes a dereference as well as helps on the cache footprint. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_events_filter.c | 31 +++++++++---------------------- 2 files changed, 10 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 441fc1bc85d6..254d04a84ec3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -663,7 +663,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred **preds; + struct filter_pred *preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8f00a11ce778..b6c910642a1e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,7 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; - struct filter_pred **preds; + struct filter_pred *preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -377,7 +377,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) preds = rcu_dereference_sched(filter->preds); for (i = 0; i < n_preds; i++) { - pred = preds[i]; + pred = &preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -559,10 +559,8 @@ static void __free_preds(struct event_filter *filter) int i; if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -572,7 +570,6 @@ static void __free_preds(struct event_filter *filter) static void reset_preds(struct event_filter *filter) { - struct filter_pred *pred; int n_preds = filter->n_preds; int i; @@ -580,10 +577,8 @@ static void reset_preds(struct event_filter *filter) if (!filter->preds) return; - for (i = 
0; i < n_preds; i++) { - pred = filter->preds[i]; - pred->fn = filter_pred_none; - } + for (i = 0; i < n_preds; i++) + filter->preds[i].fn = filter_pred_none; } static void filter_disable_preds(struct ftrace_event_call *call) @@ -658,19 +653,11 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return -EINVAL; for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; - if (!pred) - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } return 0; - oom: - __free_preds(filter); - return -ENOMEM; } static int init_filter(struct ftrace_event_call *call) @@ -730,8 +717,8 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(&filter->preds[idx], pred, fn); if (err) return err; -- cgit v1.2.1 From f76690afd05e3e163149310bdcd30234f93b3a7a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:53:06 -0500 Subject: tracing/filter: Free pred array on disabling of filter When a filter is disabled, free the preds. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b6c910642a1e..2f5458e244a3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1388,6 +1388,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); + reset_preds(call->filter); + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_preds(call->filter); remove_filter_string(call->filter); goto out_unlock; } -- cgit v1.2.1 From 61e9dea20e1ada886cc49a9ec6fe3c6ac0de7324 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:54:33 -0500 Subject: tracing/filter: Use a tree instead of stack for filter_match_preds() Currently the filter_match_preds() requires a stack to push and pop the preds to determine if the filter matches the record or not. This has two drawbacks: 1) It requires a stack to store state information. As this is done in fast paths we can't allocate the storage for this stack, and we can't use a global as it must be re-entrant. The stack is stored on the kernel stack and this greatly limits how many preds we may allow. 2) All conditions are calculated even when a short circuit exists. a || b will always calculate a and b even though a was determined to be true. Using a tree we can walk a constant structure that will save the state as we go. The algorithm is simply: pred = root; do { switch (move) { case MOVE_DOWN: if (OR or AND) { pred = left; continue; } if (pred == root) break; match = pred->fn(); pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; case MOVE_UP_FROM_LEFT: /* Only OR or AND can be a parent */ if (match && OR || !match && AND) { /* short circuit */ if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } pred = pred->right; move = MOVE_DOWN; continue; case MOVE_UP_FROM_RIGHT: if (pred == root) break; pred = pred->parent; move = left child ? 
MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } done = 1; } while (!done); This way there's no strict limit to how many preds we allow and it also will short circuit the logical operations when possible. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +- kernel/trace/trace_events_filter.c | 231 +++++++++++++++++++++++++++++-------- 2 files changed, 194 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 254d04a84ec3..bba34a72c780 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -664,6 +664,7 @@ struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -675,6 +676,9 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) + struct filter_pred; struct regex; @@ -704,7 +708,10 @@ struct filter_pred { int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2f5458e244a3..10390491b6d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,6 +123,11 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -357,52 +362,95 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; struct filter_pred *preds; struct filter_pred *pred; + struct filter_pred *root; int n_preds = ACCESS_ONCE(filter->n_preds); - int i; + int done = 0; /* no filter is considered a match */ if (!n_preds) return 1; /* - * n_preds and filter->preds is protect with preemption disabled. + * n_preds, root and filter->preds are protect with preemption disabled. 
*/ preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; - for (i = 0; i < n_preds; i++) { - pred = &preds[i]; - if (!pred->pop_n) { + pred = root; + + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* keep going to leaf node */ + pred = &preds[pred->left]; + continue; + } match = pred->fn(pred, rec); - stack[top++] = match; + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* Check for short circuits */ + if ((match && pred->op == OP_OR) || + (!match && pred->op == OP_AND)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - switch (pred->op) { - case OP_AND: - match = val1 && val2; - break; - case OP_OR: - match = val1 || val2; - break; - default: - WARN_ONCE(1, "filter op is not AND or OR"); - } - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -539,10 +587,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,8 +646,25 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + dest->left = left->index; + dest->right = right->index; + left->parent = dest->index; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. 
+ */ + dest->left = FILTER_PRED_INVALID; + + return __push_pred_stack(stack, dest); } static void __free_preds(struct event_filter *filter) @@ -574,6 +687,7 @@ static void reset_preds(struct event_filter *filter) int i; filter->n_preds = 0; + filter->root = NULL; if (!filter->preds) return; @@ -707,6 +821,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; @@ -718,7 +833,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, idx = filter->n_preds; filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(&filter->preds[idx], pred, fn); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -803,6 +918,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -812,13 +928,10 @@ static int filter_add_pred(struct filter_parse_state *ps, fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -867,7 +980,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1248,6 +1361,7 @@ static int replace_preds(struct ftrace_event_call *call, char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; @@ -1262,9 +1376,12 @@ static int replace_preds(struct ftrace_event_call *call, return err; if (!dry_run) { - err = __alloc_preds(filter, n_preds); + err = __alloc_pred_stack(&stack, n_preds); if (err) return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; } n_preds = 0; @@ -1276,14 +1393,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1293,22 +1412,44 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + filter->root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + } + + err = 0; 
+fail: + __free_pred_stack(&stack); + return err; } static int replace_system_preds(struct event_subsystem *system, -- cgit v1.2.1 From 55719274188f13cff9e3bd11fdd4c0e7617cd03d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:12:05 -0500 Subject: tracing/filter: Optimize short ciruit check The test if we should break out early for OR and AND operations can be optimized by comparing the current result with (pred->op == OP_OR) That is if the result is true and the op is an OP_OR, or if the result is false and the op is not an OP_OR (thus an OP_AND) we can break out early in either case. Otherwise we continue processing. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 10390491b6d0..0a3e0502b507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -426,9 +426,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) pred->parent, &move); continue; case MOVE_UP_FROM_LEFT: - /* Check for short circuits */ - if ((match && pred->op == OP_OR) || - (!match && pred->op == OP_AND)) { + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { if (pred == root) break; pred = get_pred_parent(pred, preds, -- cgit v1.2.1 From ec126cac23945de12eb2d103374e1f7ee97c5595 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:14:25 -0500 Subject: tracing/filter: Check the created pred tree Since the filter walks a tree to determine if a match is made or not, if the tree was incorrectly created, it could cause an infinite loop. Add a check to walk the entire tree before assigning it as a filter to make sure the tree is correct. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 72 +++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0a3e0502b507..91c9cdcb040b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1358,6 +1358,68 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. 
+ */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1366,6 +1428,7 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; struct pred_stack stack = { }; /* init to NULL */ int err; @@ -1442,7 +1505,7 @@ add_pred: if (!pred) return -EINVAL; /* This item is where we start from in matching */ - filter->root = pred; + root = pred; /* Make sure the stack is empty */ pred = __pop_pred_stack(&stack); if (WARN_ON(pred)) { @@ -1450,6 +1513,13 @@ add_pred: filter->root = NULL; goto fail; } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; } err = 0; -- cgit v1.2.1 From 43cd414552d8137157e926e46361678ea867e476 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:16:51 -0500 Subject: tracing/filter: Optimize filter by folding the tree There are many cases that a filter will contain multiple ORs or ANDs together near the leafs. Walking up and down the tree to get to the next compare can be a waste. If there are several ORs or ANDs together, fold them into a single pred and allocate an array of the conditions that they check. This will speed up the filter by linearly walking an array and can still break out if a short circuit condition is met. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 +- kernel/trace/trace_events_filter.c | 233 +++++++++++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bba34a72c780..d754330934bb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -678,6 +678,7 @@ struct event_subsystem { #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) struct filter_pred; struct regex; @@ -704,7 +705,16 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 91c9cdcb040b..2403ce5b6507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,42 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +/* + * A series of AND or ORs where found together. 
Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -414,11 +450,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) case MOVE_DOWN: /* only AND and OR have children */ if (pred->left != FILTER_PRED_INVALID) { - /* keep going to leaf node */ - pred = &preds[pred->left]; - continue; - } - match = pred->fn(pred, rec); + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); /* If this pred is the only pred */ if (pred == root) break; @@ -659,17 +700,34 @@ static int filter_set_pred(struct event_filter *filter, left = __pop_pred_stack(stack); if (!left || !right) return -EINVAL; - dest->left = left->index; - dest->right = right->index; - left->parent = dest->index; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else + } else { /* * Make dest->left invalid to be used as a quick * way to know this is a leaf node. */ dest->left = FILTER_PRED_INVALID; + /* All leafs allow folding the parent ops. 
*/ + dest->index |= FILTER_PRED_FOLD; + } + return __push_pred_stack(stack, dest); } @@ -1420,6 +1478,158 @@ static int check_pred_tree(struct event_filter *filter, return 0; } +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. 
+ */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1517,6 +1727,11 @@ add_pred: if (err) goto fail; + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + /* We don't set root until we know it works */ barrier(); filter->root = root; -- cgit v1.2.1 From 4a3d27e98a7f2682e96d6f863752e0424b00d691 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:19:49 -0500 Subject: tracing/filter: Move MAX_FILTER_PRED to local tracing directory The MAX_FILTER_PRED is only needed by the kernel/trace/*.c files. Move it to kernel/trace/trace.h. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d754330934bb..fbff872f8db1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,8 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) +#define MAX_FILTER_PRED 32 + struct filter_pred; struct regex; -- cgit v1.2.1 From bf93f9ed3a2cb89eb7e58851139d3be375b98027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:21:34 -0500 Subject: tracing/filter: Increase the max preds to 2^14 Now that the filter logic does not require to save the pred results on the stack, we can increase the max number of preds we allow. As the preds are index by a short value, and we use the MSBs as flags we can increase the max preds to 2^14 (16384) which should be way more than enough. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fbff872f8db1..856e73cc1d3f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,7 +680,14 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -#define MAX_FILTER_PRED 32 +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. 
+ */ +#define MAX_FILTER_PRED 16384 struct filter_pred; struct regex; -- cgit v1.2.1 From 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:25:46 -0500 Subject: tracing/filter: Swap entire filter of events When creating a new filter, instead of allocating the filter to the event call first and then processing the filter, it is easier to process a temporary filter and then just swap it with the call filter. By doing this, it simplifies the code. A filter is allocated and processed, when it is done, it is swapped with the call filter, synchronize_sched() is called to make sure all callers are done with the old filter (filters are called with premption disabled), and then the old filter is freed. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 251 +++++++++++++++++++++---------------- 1 file changed, 146 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2403ce5b6507..f5d335d28d0b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -425,10 +425,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) struct filter_pred *preds; struct filter_pred *pred; struct filter_pred *root; - int n_preds = ACCESS_ONCE(filter->n_preds); + int n_preds; int done = 0; /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + if (!n_preds) return 1; @@ -509,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -568,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -581,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -745,26 +755,9 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void reset_preds(struct event_filter *filter) -{ - int n_preds = filter->n_preds; - int i; - - filter->n_preds = 0; - filter->root = NULL; - if (!filter->preds) - return; - - for (i = 0; i < n_preds; i++) - filter->preds[i].fn = filter_pred_none; -} - -static void filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable(struct ftrace_event_call *call) { - struct event_filter *filter = call->filter; - call->flags &= ~TRACE_EVENT_FL_FILTERED; - reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -777,11 +770,16 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. 
+ */ void destroy_preds(struct ftrace_event_call *call) { __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_filter(void) @@ -789,11 +787,6 @@ static struct event_filter *__alloc_filter(void) struct event_filter *filter; filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); - - filter->n_preds = 0; - return filter; } @@ -838,46 +831,28 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static int init_filter(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_filter(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - - return 0; -} - -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_filter(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -1743,88 +1718,129 @@ fail: return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) goto fail; } - /* set all filter pred counts to zero */ list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + struct event_filter *filter; if (strcmp(call->class->system, system->name) != 0) continue; - reset_preds(filter); - } + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; - /* - * Since some of the preds may be used under preemption - * we need to wait for them to finish before we may - * reallocate them. 
- */ - synchronize_sched(); + list_add_tail(&filter_item->list, &filter_list); - list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; - if (strcmp(call->class->system, system->name) != 0) - continue; + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); + if (err) + goto fail_mem; - /* really apply the filter */ - filter_disable_preds(call); err = replace_preds(call, filter, ps, filter_string, false); - if (err) - filter_disable_preds(call); - else { + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } if (fail) goto fail; + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } return 0; fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_filter(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - reset_preds(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; /* Make sure the filter is not being used */ synchronize_sched(); - __free_preds(call->filter); - remove_filter_string(call->filter); + __free_filter(filter); goto out_unlock; } @@ -1833,29 +1849,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - /* - * Make sure all the pred counts are zero so that - * no task is using it when we reallocate the preds array. 
- */ - reset_preds(call->filter); - synchronize_sched(); - - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1868,18 +1896,21 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1888,7 +1919,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1945,7 +1986,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; filter = __alloc_filter(); - if (IS_ERR(filter)) { + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } -- cgit v1.2.1 From 4defe682d81a4960b6840ee4ed1a36f9db77c7bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:29:06 -0500 Subject: tracing/filter: Remove synchronize_sched() from __alloc_preds() Because the filters are processed first and then activated (added to the call), we no longer need to worry about the preds of the filter in __alloc_preds() being used. As the filter that is allocating preds is not activated yet. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f5d335d28d0b..3249b4f77ef0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -795,33 +795,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) struct filter_pred *pred; int i; - if (filter->preds) { - if (filter->a_preds < n_preds) { - /* - * We need to reallocate. - * We should have already have zeroed out - * the pred count and called synchronized_sched() - * to make sure no one is using the preds. 
- */ - if (WARN_ON_ONCE(filter->n_preds)) { - /* We need to reset it now */ - filter->n_preds = 0; - synchronize_sched(); - } - __free_preds(filter); - } - } + if (filter->preds) + __free_preds(filter); + + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - if (!filter->preds) { - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->a_preds = n_preds; - } if (!filter->preds) return -ENOMEM; - if (WARN_ON(filter->a_preds < n_preds)) - return -EINVAL; + filter->a_preds = n_preds; + filter->n_preds = 0; for (i = 0; i < n_preds; i++) { pred = &filter->preds[i]; -- cgit v1.2.1 From ba976970c79fd2fbfe1a4b3b6766a318f4eb9d4c Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:20 +1100 Subject: tracing/syscalls: Don't add events for unmapped syscalls FTRACE_SYSCALLS would create events for each and every system call, even if it had failed to map the system call's name with it's number. This resulted in a number of events being created that would not behave as expected. This could happen, for example, on architectures who's symbol names are unusual and will not match the system call name. It could also happen with system calls which were mapped to sys_ni_syscall. This patch changes the default system call number in the metadata to -1. If the system call name from the metadata is not successfully mapped to a system call number during boot, than the event initialisation routine will now return an error, preventing the event from being created. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-2-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08d2093..a9ceabd52247 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -424,6 +424,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int init_syscall_trace(struct ftrace_event_call *call) { int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } if (set_syscall_print_fmt(call) < 0) return -ENOMEM; -- cgit v1.2.1 From 3773b389b6927595512558594d040c1edba46f36 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:21 +1100 Subject: tracing/syscalls: Convert redundant syscall_nr checks into WARN_ON With the ftrace events now checking if the syscall_nr is valid upon initialisation it should no longer be possible to register or unregister a syscall event without a valid syscall_nr since they should not be created. This adds a WARN_ON_ONCE in the register and unregister functions to locate potential regressions in the future. 
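For illustration only (this helper does not exist in the tree; the name is hypothetical), the invariant the WARN_ON_ONCE() conversion relies on can be written as a small predicate. After the init_syscall_trace() change above, an event whose syscall_nr failed to map is never created, so seeing this test fail at register/unregister time indicates a regression rather than an expected condition:

    /* Editor's sketch, not part of this patch. */
    static inline bool syscall_nr_valid(struct ftrace_event_call *call)
    {
    	int num = ((struct syscall_metadata *)call->data)->syscall_nr;

    	return num >= 0 && num < NR_syscalls;
    }
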
Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-3-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a9ceabd52247..423094288fb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -359,7 +359,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -377,7 +377,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -393,7 +393,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -411,7 +411,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; -- cgit v1.2.1 From c763ba06bd9b5db2c46c36276c89103d92d2c604 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:22 +1100 Subject: tracing/syscalls: Make arch_syscall_addr weak Some architectures use non-trivial system call tables and will not work with the generic arch_syscall_addr code. For example, PowerPC64 uses a table of twin long longs. This patch makes the generic arch_syscall_addr weak to allow architectures with non-trivial system call tables to override it. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-4-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 423094288fb5..af831545f656 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -446,7 +446,7 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } -- cgit v1.2.1 From b2d55496818d64310b9f5486d4eea76ea614d7f8 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:23 +1100 Subject: tracing/syscalls: Allow arch specific syscall symbol matching Some architectures have unusual symbol names and the generic code to match the symbol name with the function name for the syscall metadata will fail. For example, symbols on PPC64 start with a period and the generic code will fail to match them. This patch moves the match logic out into a separate function which an arch can override by defining ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and implementing arch_syscall_match_sym_name. 
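As a hedged illustration (the arch-side code is not part of this kernel/ diff, and the exact offsets here are illustrative), an architecture whose text symbols carry a leading period could provide an override along these lines in its asm/ftrace.h:

    #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME 1

    static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
    {
    	/*
    	 * Skip the leading '.' the arch prepends to its symbols, then
    	 * compare past the "sys"/"SyS" prefix just as the generic
    	 * helper does.
    	 */
    	return !strcmp(sym + 4, name + 3);
    }

Because the generic version is guarded by #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME, defining the macro is all that is needed to switch to the arch implementation.
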
Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-5-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index af831545f656..86a23e7de031 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -73,13 +86,7 @@ find_syscall_meta(unsigned long syscall) kallsyms_lookup(syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { - /* - * Only compare after the "sys" prefix. Archs that use - * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted - * mismatch. - */ - if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; } return NULL; -- cgit v1.2.1 From ae07f551c42d6e4162436ca452a199deac9dab4d Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:25 +1100 Subject: tracing/syscalls: Early terminate search for sys_ni_syscall Many system calls are unimplemented and mapped to sys_ni_syscall, but at boot ftrace would still search through every syscall metadata entry for a match which wouldn't be there. This patch adds causes the search to terminate early if the system call is not mapped. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-7-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 86a23e7de031..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -85,6 +85,9 @@ find_syscall_meta(unsigned long syscall) stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + for ( ; start < stop; start++) { if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; -- cgit v1.2.1 From 7ff207928eb0761fa6b6c39eda82ac07a5241acf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Feb 2011 15:18:00 +0100 Subject: Revert "lockdep, timer: Fix del_timer_sync() annotation" Both attempts at trying to allow softirq usage for del_timer_sync() failed (produced bogus warnings), so revert the commit for this release: f266a5110d45: lockdep, timer: Fix del_timer_sync() annotation and try again later. 
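For context, the annotation being reverted only changes how lockdep models del_timer_sync(); the class of bug it is meant to flag looks roughly like this (editor's sketch using the 2.6.38-era timer callback signature, names illustrative):

    static struct timer_list my_timer;

    static void my_timer_fn(unsigned long data)
    {
    	/*
    	 * BAD: del_timer_sync() waits for the running handler to finish,
    	 * but the handler is us, so it never completes. lockdep can flag
    	 * this because the handler runs inside the timer's lockdep_map
    	 * and del_timer_sync() acquires that same map.
    	 */
    	del_timer_sync(&my_timer);
    }
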
Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Yong Zhang Cc: Thomas Gleixner LKML-Reference: <1297174680.13327.107.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/timer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index d53ce66daea0..d6459923d245 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -959,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * hardirq contexts. The caller must not hold locks which would prevent + * interrupt contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. @@ -971,12 +971,10 @@ int del_timer_sync(struct timer_list *timer) #ifdef CONFIG_LOCKDEP unsigned long flags; - raw_local_irq_save(flags); - local_bh_disable(); + local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - _local_bh_enable(); - raw_local_irq_restore(flags); + local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it -- cgit v1.2.1 From dc5f219e88294b93009eef946251251ffffb6d60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 13:19:20 +0100 Subject: genirq: Add IRQF_FORCE_RESUME Xen needs to reenable interrupts which are marked IRQF_NO_SUSPEND in the resume path. Add a flag to force the reenabling in the resume code. Tested-and-acked-by: Ian Campbell Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 11 ++++++++++- kernel/irq/pm.c | 3 --- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..b4198ee8cfdf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -282,8 +282,17 @@ EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { - if (resume) + if (resume) { + if (!(desc->status & IRQ_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled ! */ + desc->depth++; + } desc->status &= ~IRQ_SUSPENDED; + } switch (desc->depth) { case 0: diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b03..d6bfb89cce91 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -53,9 +53,6 @@ void resume_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - if (!(desc->status & IRQ_SUSPENDED)) - continue; - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.1 From 5e38ca8f3ea423442eaafe1b7e206084aa38120a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 2 Feb 2011 13:28:18 +0100 Subject: tracing: Add unstable sched clock note to the warning The warning "Delta way too big" warning might appear on a system with unstable shed clock right after the system is resumed and tracing was enabled during the suspend. Since it's not realy bug, and the unstable sched clock is working fast and reliable otherwise, Steven suggested to keep using the sched clock in any case and just to make note in the warning itself. 
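The idiom used below is simply selecting between an empty string and the extra hint at the WARN_ONCE() call site, so the hint is only printed when the sched clock is unstable. A stripped-down sketch (identifiers are illustrative, not taken from the patch):

    static void check_delta(u64 delta, u64 ts, bool clock_stable)
    {
    	WARN_ONCE(delta > (1ULL << 59),
    		  "Delta way too big! %llu ts=%llu\n%s",
    		  (unsigned long long)delta, (unsigned long long)ts,
    		  clock_stable ? "" :
    		  "If you just came from a suspend/resume, try:\n"
    		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
    }
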
Signed-off-by: Jiri Olsa LKML-Reference: <1296649698-6003-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..7739893a1d0a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,10 +2163,14 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } -- cgit v1.2.1 From c305d524e5dd3c3c7a6035083e30950bea1b52dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 17:10:48 +0100 Subject: softirq: Avoid stack switch from ksoftirqd ksoftirqd() calls do_softirq() which switches stacks on several architectures. That makes no sense at all. ksoftirqd's stack is sufficient. Call __do_softirq() directly. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Acked-by: David Miller Cc: Paul Mundt Reviewed-by: Frank Rowand LKML-Reference: --- kernel/softirq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..c0490464e92f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -738,7 +738,10 @@ static int run_ksoftirqd(void * __bind_cpu) don't process */ if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); preempt_enable_no_resched(); cond_resched(); preempt_disable(); -- cgit v1.2.1 From 44951a60ff888add9e84f509ffce20052e45af94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 17:33:49 +0100 Subject: genirq: Remove dead code CONFIG_KSTAT_IRQS_ONDEMAND does not exist. It's not worth to implement it. Use sparse irqs if you care about memory consumption of the interrupt layer. Found by undertaker: http://vamos.informatik.uni-erlangen.de/trac/undertaker Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 282f20230e67..a7ac6e1e7074 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -251,7 +251,6 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - /* TODO : do this allocation on-demand ... 
*/ desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); @@ -277,22 +276,6 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { -#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) - struct irq_desc *desc; - unsigned int i; - - for (i = 0; i < cnt; i++) { - desc = irq_to_desc(start + i); - if (desc && !desc->kstat_irqs) { - unsigned int __percpu *stats = alloc_percpu(unsigned int); - - if (!stats) - return -1; - if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) - free_percpu(stats); - } - } -#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ -- cgit v1.2.1 From 87d80de2800d087ea833cb79bc13f85ff34ed49f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:19:49 -0500 Subject: tracing: Remove obsolete sched_switch tracer The trace events sched_switch and sched_wakeup do the same thing as the stand alone sched_switch tracer does. It is no longer needed. Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 48 --------------------------------------- 1 file changed, 48 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void stop_sched_trace(struct trace_array *tr) -{ - tracing_stop_sched_switch_record(); -} - -static int sched_switch_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); - return 0; -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ - if (sched_ref) - stop_sched_trace(tr); -} - -static void sched_switch_trace_start(struct trace_array *tr) -{ - sched_stopped = 0; -} - -static void sched_switch_trace_stop(struct trace_array *tr) -{ - sched_stopped = 1; -} - -static struct tracer sched_switch_trace __read_mostly = -{ - .name = "sched_switch", - .init = sched_switch_trace_init, - .reset = sched_switch_trace_reset, - .start = sched_switch_trace_start, - .stop = sched_switch_trace_stop, - .wait_pipe = poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ - return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); - -- cgit v1.2.1 From 6752ab4a9c30d5411b2dfdb251a3f1cb18aae487 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:54:06 -0500 Subject: tracing: Deprecate tracing_enabled for tracing_on tracing_enabled should not be used, it is heavy weight and does not do much in helping lower the overhead. tracing_on should be used instead. Warn users to use tracing_on when tracing_enabled is used as it will soon be removed from the tracing directory. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb80589..8dc8da6733f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2710,6 +2710,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. 
Use tracing_on"); + if (val) { tracer_enabled = 1; if (current_trace->start) -- cgit v1.2.1 From 4149efb22da66e326fc48baf80d628834509f7f0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Feb 2011 10:39:03 +0100 Subject: workqueue: add system_freezeable_wq Add system wide freezeable workqueue. Signed-off-by: Tejun Heo Acked-by: Dmitry Torokhov Cc: "Rafael J. Wysocki" --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 11869faa6819..28f8bd08f0e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -249,10 +249,12 @@ struct workqueue_struct *system_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_freezeable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); +EXPORT_SYMBOL_GPL(system_freezeable_wq); #define CREATE_TRACE_POINTS #include @@ -3764,8 +3766,10 @@ static int __init init_workqueues(void) system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); + system_freezeable_wq = alloc_workqueue("events_freezeable", + WQ_FREEZEABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq); + !system_unbound_wq || !system_freezeable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.1 From 5651f7f47dbb1cf2b95a60582546db4ff508e2b4 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 9 Feb 2011 14:02:33 -0500 Subject: watchdog, nmi: Lower the severity of error messages During boot if the hardlockup detector fails to initialize, it complains very loudly. Some failures should be expected under certain situations, ie no lapics, or resource in-use. Tone those error messages down a bit. Keep the rest at a high level. Reported-by: Paul Bolle Tested-by: Paul Bolle Signed-off-by: Don Zickus Cc: Peter Zijlstra LKML-Reference: <1297278153-21111-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f37f974aa81b..18bb15776c57 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,8 +363,14 @@ static int watchdog_nmi_enable(int cpu) goto out_save; } - printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + else + printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ -- cgit v1.2.1 From 986c011ddbb3ed44b35e1bfd67f6aa60b293b495 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 9 Feb 2011 16:04:25 -0800 Subject: genirq: Call bus_lock/unlock functions in setup_irq() irq_chips that supply .irq_bus_lock/.irq_bus_sync_unlock functions, expect that the other chip methods will be called inside of calls to the pair. 
If this expectation is not met, things tend to not work. Make setup_irq() call chip_bus_lock()/chip_bus_sync_unlock() too. For the vast majority of irq_chips, this will be a NOP as most don't have these bus lock functions. [ tglx: No we don't want to call that in __setup_irq(). Way too many error exit pathes. ] Signed-off-by: David Daney LKML-Reference: <1297296265-18680-1-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..a00bf2cd67ed 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -871,9 +871,14 @@ out_thread: */ int setup_irq(unsigned int irq, struct irqaction *act) { + int retval; struct irq_desc *desc = irq_to_desc(irq); - return __setup_irq(irq, desc, act); + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; } EXPORT_SYMBOL_GPL(setup_irq); -- cgit v1.2.1 From 868baf07b1a259f5f3803c1dc2777b6c358f83cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Feb 2011 21:26:13 -0500 Subject: ftrace: Fix memory leak with function graph and cpu hotplug When the fuction graph tracer starts, it needs to make a special stack for each task to save the real return values of the tasks. All running tasks have this stack created, as well as any new tasks. On CPU hot plug, the new idle task will allocate a stack as well when init_idle() is called. The problem is that cpu hotplug does not create a new idle_task. Instead it uses the idle task that existed when the cpu went down. ftrace_graph_init_task() will add a new ret_stack to the task that is given to it. Because a clone will make the task have a stack of its parent it does not check if the task's ret_stack is already NULL or not. When the CPU hotplug code starts a CPU up again, it will allocate a new stack even though one already existed for it. The solution is to treat the idle_task specially. In fact, the function_graph code already does, just not at init_idle(). Instead of using the ftrace_graph_init_task() for the idle task, which that function expects the task to be a clone, have a separate ftrace_graph_init_idle_task(). Also, we will create a per_cpu ret_stack that is used by the idle task. When we call ftrace_graph_init_idle_task() it will check if the idle task's ret_stack is NULL, if it is, then it will assign it the per_cpu ret_stack. 
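The shape of the fix is a lazily allocated per-CPU buffer that is kept across CPU offline/online, so a re-onlined CPU reuses the allocation its idle task already had instead of leaking it. A minimal sketch of that pattern (all names hypothetical, not the actual ftrace code):

    static DEFINE_PER_CPU(void *, idle_buf);

    static void *get_idle_buf(int cpu, size_t size)
    {
    	void *buf = per_cpu(idle_buf, cpu);

    	/* Reuse the buffer this CPU was given the last time it was online. */
    	if (buf)
    		return buf;

    	buf = kmalloc(size, GFP_KERNEL);
    	if (buf)
    		per_cpu(idle_buf, cpu) = buf;
    	return buf;
    }
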
Reported-by: Benjamin Herrenschmidt Suggested-by: Peter Zijlstra Cc: Stable Tree Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- kernel/trace/ftrace.c | 52 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..fbe86cb04b61 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5571,7 +5571,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); + ftrace_graph_init_idle_task(idle, cpu); } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..888b611897d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } -- cgit v1.2.1 From f590308536db432e4747f562b29e5858123938e9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Feb 2011 19:21:25 -0800 Subject: timer debug: Hide kernel addresses via %pK in /proc/timer_list In the continuing effort to avoid kernel addresses leaking to unprivileged users, this patch switches to %pK for /proc/timer_list reporting. 
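The change only affects how the pointer is rendered for readers that are not allowed to see kernel addresses (sketch of the intent, assuming the kptr_restrict sysctl that %pK consults; the actual hunks follow below):

    SEQ_printf(m, " .base: %p\n",  base);  /* old: always prints the real
                                            * kernel address */
    SEQ_printf(m, " .base: %pK\n", base);  /* new: prints zeroes instead when
                                            * kptr_restrict denies the reading
                                            * process access to kernel
                                            * pointers */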
Signed-off-by: Kees Cook Cc: John Stultz Cc: Dan Rosenberg Cc: Eugene Teo Cc: Linus Torvalds LKML-Reference: <20110212032125.GA23571@outflux.net> Signed-off-by: Ingo Molnar --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 32a19f9397fc..3258455549f4 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym) char symname[KSYM_NAME_LEN]; if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%p>", sym); + SEQ_printf(m, "<%pK>", sym); else SEQ_printf(m, "%s", symname); } @@ -112,7 +112,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %p\n", base); + SEQ_printf(m, " .base: %pK\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", -- cgit v1.2.1 From 0de4b34d466bae571b50f41c7296b85248205e35 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 14 Feb 2011 14:48:07 +0900 Subject: tracing/kprobe: Fix NULL pointer deref check Add NULL check for avoiding NULL pointer deref. This bug has been introduced by: 1ff511e35ed8: tracing/kprobes: Add bitfield type which causes a null pointer dereference bug when kprobe-tracer parses an argument without type. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Masami Hiramatsu Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Peter Zijlstra LKML-Reference: <20110214054807.8919.69740.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar Reported-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccdc542022c3..8435b43b1782 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -935,7 +935,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); - if (ret >= 0) + if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, -- cgit v1.2.1 From 7576958a9d5a4a677ad7dd40901cdbb6c1110c98 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 14 Feb 2011 14:04:46 +0100 Subject: workqueue: wake up a worker when a rescuer is leaving a gcwq After executing the matching works, a rescuer leaves the gcwq whether there are more pending works or not. This may decrease the concurrency level to zero and stall execution until a new work item is queued on the gcwq. Make rescuer wake up a regular worker when it leaves a gcwq if there are more works to execute, so that execution isn't stalled. Signed-off-by: Tejun Heo Reported-by: Ray Jui Cc: stable@kernel.org --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 11869faa6819..90a17ca2ad0b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2047,6 +2047,15 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); + + /* + * Leave this gcwq. If keep_working() is %true, notify a + * regular worker; otherwise, we end up with 0 concurrency + * and stalling the execution. 
+ */ + if (keep_working(gcwq)) + wake_up_worker(gcwq); + spin_unlock_irq(&gcwq->lock); } -- cgit v1.2.1 From 4fe757dd48a9e95e1a071291f15dda5421dacb66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Feb 2011 22:26:07 +0100 Subject: perf: Fix throttle logic It was possible to call pmu::start() on an already running event. In particular this lead so some wreckage as the hrtimer events would re-initialize active timers. This was due to throttled events being activated again by scheduling. Scheduling in a context would add and force start events, resulting in running events with a possible throttle status. The next tick to hit that task will then try to unthrottle the event and call ->start() on an already running event. Reported-by: Jeff Moyer Cc: Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 999835b6112b..656222fcf767 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -782,6 +782,10 @@ retry: raw_spin_unlock_irq(&ctx->lock); } +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, @@ -794,6 +798,17 @@ event_sched_in(struct perf_event *event, event->state = PERF_EVENT_STATE_ACTIVE; event->oncpu = smp_processor_id(); + + /* + * Unthrottle events, since we scheduled we might have missed several + * ticks already, also for a heavily scheduling task there is little + * guarantee it'll get a tick in a timely manner. + */ + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { + perf_log_throttle(event, 1); + event->hw.interrupts = 0; + } + /* * The new state must be visible before we turn it on in the hardware: */ @@ -1596,10 +1611,6 @@ void __perf_event_task_sched_in(struct task_struct *task) } } -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_event *event, int enable); - static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; -- cgit v1.2.1 From d41d5a01631af821d3a3447e6613a316f5ee6c25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Feb 2011 17:02:20 +0100 Subject: cgroup: Fix cgroup_subsys::exit callback Make the ::exit method act like ::attach, it is after all very nearly the same thing. The bug had no effect on correctness - fixing it is an optimization for the scheduler. Also, later perf-cgroups patches rely on it. 
Signed-off-by: Peter Zijlstra Acked-by: Paul Menage LKML-Reference: <1297160655.13327.92.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/cgroup.c | 31 ++++++++++++++++++------------- kernel/sched.c | 6 ++---- 2 files changed, 20 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..f6495f33a355 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. @@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } task_unlock(tsk); + if (cg) put_css_set_taskexit(cg); } diff --git a/kernel/sched.c b/kernel/sched.c index e142e92f38da..79e611cd83dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; - if (p->flags & PF_EXITING) - return &root_task_group; - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -8863,7 +8860,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. -- cgit v1.2.1 From e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 14 Feb 2011 11:20:01 +0200 Subject: perf: Add cgroup support This kernel patch adds the ability to filter monitoring based on container groups (cgroups). This is for use in per-cpu mode only. The cgroup to monitor is passed as a file descriptor in the pid argument to the syscall. The file descriptor must be opened to the cgroup name in the cgroup filesystem. For instance, if the cgroup name is foo and cgroupfs is mounted in /cgroup, then the file descriptor is opened to /cgroup/foo. Cgroup mode is activated by passing PERF_FLAG_PID_CGROUP in the flags argument to the syscall. 
For instance to measure in cgroup foo on CPU1 assuming cgroupfs is mounted under /cgroup: struct perf_event_attr attr; int cgroup_fd, fd; cgroup_fd = open("/cgroup/foo", O_RDONLY); fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP); close(cgroup_fd); Signed-off-by: Stephane Eranian [ added perf_cgroup_{exit,attach} ] Signed-off-by: Peter Zijlstra LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/cgroup.c | 23 ++ kernel/perf_event.c | 638 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 626 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f6495f33a355..95362d15128c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d3f282fa50e..65dcdc76d709 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) return data.ret; } +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; -atomic_t perf_task_events __read_mostly; +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +atomic_t perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } @@ -162,6 +176,338 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void 
perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + /* + * do not update time when cgroup is not active + */ + if (!event->cgrp || cgrp != event->cgrp) + return; + + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ + struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + if (!task) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = now; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. 
+ */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) + return PTR_ERR(css); + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } else { + /* must be done before we fput() the file */ + perf_get_cgroup(event); + } + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. 
+ */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx) static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + return ctx ? ctx->time : 0; } @@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - - if (ctx->is_active) + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. 
+ */ + if (is_cgroup_event(event)) run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; else run_end = event->tstamp_stopped; @@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event) run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; + } /* @@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } + if (is_cgroup_event(event)) { + ctx->nr_cgroups++; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -616,7 +995,8 @@ out: static inline int event_filter_match(struct perf_event *event) { - return event->cpu == -1 || event->cpu == smp_processor_id(); + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); } static void @@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = ctx->time - event->tstamp_stopped; + delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; } @@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - /* * Cross CPU call to remove a performance event * @@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info) */ if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); + update_cgrp_time_from_event(event); update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); @@ -851,6 +1226,41 @@ retry: raw_spin_unlock_irq(&ctx->lock); } +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamp + * are identical (or very close). Given that tstamp is, + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the time time source all along. We believe it + * is cleaner and simpler to understand. 
+ */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event, event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = tstamp - ctx->timestamp; + perf_set_shadow_time(event, ctx, tstamp); if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void perf_event_context_sched_in(struct perf_event_context *ctx); +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); /* * Cross CPU call to install and enable a performance event @@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info) * which do context switches with IRQs enabled. */ if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); @@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info) if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, perf_clock()); + __perf_event_mark_enabled(event, ctx); - if (!event_filter_match(event)) + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); goto unlock; + } /* * If the event is in a group and isn't the group leader, @@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) goto out; @@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task, for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. 
+ * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); } static void task_ctx_sched_out(struct perf_event_context *ctx, @@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { + u64 now; + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; - ctx->timestamp = perf_clock(); - + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, now); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1601,11 +2048,12 @@ out: } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task); } static void task_ctx_sched_in(struct perf_event_context *ctx, @@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, if (cpuctx->task_ctx == ctx) return; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, NULL); cpuctx->task_ctx = ctx; } -static void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) { struct perf_cpu_context *cpuctx; @@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx) */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); cpuctx->task_ctx = ctx; @@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task) if (likely(!ctx)) continue; - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, task); } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state. 
+ * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); @@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); } @@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); @@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event) * (e.g., thread is blocked), in that case * we cannot update context time */ - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_task_events); + jump_label_dec(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event) event->buffer = NULL; } + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + if (event->destroy) event->destroy(event); @@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); + update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); @@ -5725,7 +6189,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_task_events); + jump_label_inc(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open, int err; /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); @@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. 
+ */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + event_fd = get_unused_fd_flags(O_RDWR); if (event_fd < 0) return event_fd; @@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } - if (pid != -1) { + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); @@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + } + /* * Special case software events and allow them to be part of * any hardware group. @@ -6808,3 +7287,92 @@ unlock: return ret; } device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + struct perf_cgroup_info *t; + int c; + + jc = kmalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + memset(jc, 0, sizeof(*jc)); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(c) { + t = per_cpu_ptr(jc->info, c); + t->time = 0; + t->timestamp = 0; + } + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.1 From 163ec4354a5135c6c38c3f4a9b46a31900ebdf48 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2011 11:22:34 +0100 Subject: perf: Optimize throttling code By pre-computing the maximum number of samples per tick we can avoid a multiplication and a conditional since MAX_INTERRUPTS > max_samples_per_tick. 
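For a feel for the numbers, using the DEFAULT_MAX_SAMPLE_RATE of 100000 from the hunk below and a few common HZ settings (illustration only):

    /* max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ) */
    HZ == 1000:  DIV_ROUND_UP(100000, 1000) ==  100 samples per tick
    HZ ==  250:  DIV_ROUND_UP(100000,  250) ==  400 samples per tick
    HZ ==  100:  DIV_ROUND_UP(100000,  100) == 1000 samples per tick

Once hwc->interrupts reaches that limit within one tick, the event is throttled exactly as before; the per-interrupt multiplication is simply gone.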
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 43 ++++++++++++++++++++++++------------------- kernel/sysctl.c | 2 +- 2 files changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 65dcdc76d709..e03be08d0ddf 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -150,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* * max perf event sample rate */ -int sysctl_perf_event_sample_rate __read_mostly = 100000; +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} static atomic64_t perf_event_id; @@ -4941,26 +4958,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (unlikely(!is_sampling_event(event))) return 0; - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_event_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling events even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the event got enabled again: - */ + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); ret = 1; } - } + } else + hwc->interrupts++; if (event->attr.freq) { u64 now = perf_clock(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83db985..daef911cbadb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -948,7 +948,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = perf_proc_update_handler, }, #endif #ifdef CONFIG_KMEMCHECK -- cgit v1.2.1 From ba3dd36c6775264ee6e7354ba1aabcd6e86d7298 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Feb 2011 12:41:46 +0100 Subject: perf: Optimize hrtimer events There is no need to re-initialize the hrtimer every time we start it, so don't do that (shaves a few cycles). Also, since we know hrtimers run at a fixed rate (nanoseconds) we can pre-compute the desired frequency at which they tick. This avoids us having to go through the whole adaptive frequency feedback logic (shaves another few cycles). 
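As a quick example of the static mapping (sketch only; the actual conversion is in the hunk below), a requested 4 kHz sampling frequency becomes a fixed hrtimer period:

    /* event->attr.freq = 1, event->attr.sample_freq = 4000 */
    event->attr.sample_period = NSEC_PER_SEC / 4000;     /* 250000 ns */
    hwc->sample_period = event->attr.sample_period;
    local64_set(&hwc->period_left, hwc->sample_period);
    event->attr.freq = 0;     /* no feedback loop: the hrtimer simply fires
                               * every 250 us */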
Signed-off-by: Peter Zijlstra LKML-Reference: <1297448589.5226.47.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e03be08d0ddf..a0a6987fabc4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5602,6 +5602,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + event->pmu->read(event); perf_sample_data_init(&data, 0); @@ -5628,9 +5632,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - period = local64_read(&hwc->period_left); if (period) { if (period < 0) @@ -5657,6 +5658,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) } } +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. + */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + event->attr.freq = 0; + } +} + /* * Software event: cpu wall time clock */ @@ -5709,6 +5734,8 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } @@ -5787,6 +5814,8 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } -- cgit v1.2.1 From 46e49b3836c7cd2ae5b5fe76fa981d0d292a52fe Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 14 Feb 2011 14:38:50 -0800 Subject: sched: Wholesale removal of sd_idle logic sd_idle logic was introduced way back in 2005 (commit 5969fe06), as an HT optimization. As per the discussion in the thread here: lkml - sched: Resolve sd_idle and first_idle_cpu Catch-22 - v1 https://patchwork.kernel.org/patch/532501/ The capacity based logic in the load balancer right now handles this in a much cleaner way, handling more than 2 SMT siblings etc, and sd_idle does not seem to bring any additional benefits. sd_idle logic also has some bugs that has performance impact. Here is the patch that removes the sd_idle logic altogether. Also, there was a dependency of sched_mc_power_savings == 2, with sd_idle logic. 
Signed-off-by: Venkatesh Pallipadi Acked-by: Vaidyanathan Srinivasan Signed-off-by: Peter Zijlstra LKML-Reference: <1297723130-693-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 53 +++++++++++------------------------------------------ 1 file changed, 11 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 027024694043..d384e739ea95 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2672,7 +2672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. @@ -2680,7 +2679,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, + enum cpu_idle_type idle, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { @@ -2700,9 +2699,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -2817,15 +2813,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *sg = sd->groups; @@ -2843,7 +2837,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) @@ -3095,7 +3089,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. 
@@ -3108,7 +3101,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) + const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -3118,8 +3111,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); /* Cases where imbalance does not exist from POV of this_cpu */ /* 1) this_cpu is not the appropriate cpu to perform load balancing @@ -3255,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, +static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { @@ -3287,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, * move_tasks() will succeed. ld_moved will be true and this * active balance code will not be triggered. */ - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return 0; - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } @@ -3308,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3317,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. 
- */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + group = find_busiest_group(sd, this_cpu, &imbalance, idle, cpus, balance); if (*balance == 0) @@ -3392,8 +3370,7 @@ redo: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), - this_cpu)) { + if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3448,10 +3425,6 @@ redo: sd->balance_interval *= 2; } - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - goto out; out_balanced: @@ -3465,11 +3438,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; + ld_moved = 0; out: return ld_moved; } -- cgit v1.2.1 From 48228f7b470a74b6469a250d2977a13128d8fe96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 12:39:54 -0500 Subject: lockdep/timers: Explain in detail the locking problems del_timer_sync() may cause Twice I had to explain the output about why lockdep gives an error with locks in IRQ context and with del_timer_sync(). Might as well write it up and place it in the comments above the code in del_timer_sync(). Perhaps the next time this lockdep dump triggers people will understand the issues. It is a ticky issue and very subtle, explaining it in detail in the code may help others understand the issue when they stumble upon the bug again. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <1297186794.23343.19.camel@gandalf.stny.rr.com> Signed-off-by: Ingo Molnar --- kernel/timer.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..5f40c2e0a94e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -964,6 +964,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. * + * Note: You must not hold locks that are held in interrupt context + * while calling this function. Even if the lock has nothing to do + * with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * * The function returns whether it has deactivated a pending timer or not. */ int del_timer_sync(struct timer_list *timer) @@ -971,6 +990,10 @@ int del_timer_sync(struct timer_list *timer) #ifdef CONFIG_LOCKDEP unsigned long flags; + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. 
+ */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); -- cgit v1.2.1 From 58a69cb47ec6991bf006a3e5d202e8571b0327a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 09:25:31 +0100 Subject: workqueue, freezer: unify spelling of 'freeze' + 'able' to 'freezable' There are two spellings in use for 'freeze' + 'able' - 'freezable' and 'freezeable'. The former is the more prominent one. The latter is mostly used by workqueue and in a few other odd places. Unify the spelling to 'freezable'. Signed-off-by: Tejun Heo Reported-by: Alan Stern Acked-by: "Rafael J. Wysocki" Acked-by: Greg Kroah-Hartman Acked-by: Dmitry Torokhov Cc: David Woodhouse Cc: Alex Dubov Cc: "David S. Miller" Cc: Steven Whitehouse --- kernel/power/main.c | 2 +- kernel/power/process.c | 6 +++--- kernel/workqueue.c | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 7b5db6a8561e..701853042c28 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } diff --git a/kernel/power/process.c b/kernel/power/process.c index d6d2a10320e0..0cf3a27a6c9d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,7 +22,7 @@ */ #define TIMEOUT (20 * HZ) -static inline int freezeable(struct task_struct * p) +static inline int freezable(struct task_struct * p) { if ((p == current) || (p->flags & PF_NOFREEZE) || @@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezeable(p)) + if (frozen(p) || !freezable(p)) continue; if (!freeze_task(p, sig_only)) @@ -167,7 +167,7 @@ static void thaw_tasks(bool nosig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) + if (!freezable(p)) continue; if (nosig_only && should_send_signal(p)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90a17ca2ad0b..88a3e34f51f6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2965,7 +2965,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, */ spin_lock(&workqueue_lock); - if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + if (workqueue_freezing && wq->flags & WQ_FREEZABLE) for_each_cwq_cpu(cpu, wq) get_cwq(cpu, wq)->max_active = 0; @@ -3077,7 +3077,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) spin_lock_irq(&gcwq->lock); - if (!(wq->flags & WQ_FREEZEABLE) || + if (!(wq->flags & WQ_FREEZABLE) || !(gcwq->flags & GCWQ_FREEZING)) get_cwq(gcwq->cpu, wq)->max_active = max_active; @@ -3327,7 +3327,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * want to get it over with ASAP - spam rescuers, wake up as * many idlers as necessary and create new ones till the * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezeable cwqs. Don't declare + * may be frozen works in freezable cwqs. Don't declare * completion while frozen. */ while (gcwq->nr_workers != gcwq->nr_idle || @@ -3585,9 +3585,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu); /** * freeze_workqueues_begin - begin freezing workqueues * - * Start freezing workqueues. 
After this function returns, all - * freezeable workqueues will queue new works to their frozen_works - * list instead of gcwq->worklist. + * Start freezing workqueues. After this function returns, all freezable + * workqueues will queue new works to their frozen_works list instead of + * gcwq->worklist. * * CONTEXT: * Grabs and releases workqueue_lock and gcwq->lock's. @@ -3613,7 +3613,7 @@ void freeze_workqueues_begin(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (cwq && wq->flags & WQ_FREEZEABLE) + if (cwq && wq->flags & WQ_FREEZABLE) cwq->max_active = 0; } @@ -3624,7 +3624,7 @@ void freeze_workqueues_begin(void) } /** - * freeze_workqueues_busy - are freezeable workqueues still busy? + * freeze_workqueues_busy - are freezable workqueues still busy? * * Check whether freezing is complete. This function must be called * between freeze_workqueues_begin() and thaw_workqueues(). @@ -3633,8 +3633,8 @@ void freeze_workqueues_begin(void) * Grabs and releases workqueue_lock. * * RETURNS: - * %true if some freezeable workqueues are still busy. %false if - * freezing is complete. + * %true if some freezable workqueues are still busy. %false if freezing + * is complete. */ bool freeze_workqueues_busy(void) { @@ -3654,7 +3654,7 @@ bool freeze_workqueues_busy(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZABLE)) continue; BUG_ON(cwq->nr_active < 0); @@ -3699,7 +3699,7 @@ void thaw_workqueues(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZABLE)) continue; /* restore max_active and repopulate worklist */ -- cgit v1.2.1 From 3233cdbd9fa347a6d6897a94cc6ed0302ae83c4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 18:10:19 +0100 Subject: workqueue: make sure MAYDAY_INITIAL_TIMEOUT is at least 2 jiffies long MAYDAY_INITIAL_TIMEOUT is defined as HZ / 100 and depending on configuration may end up 0 or 1. Even when it's 1, depending on when the mayday timer is added in the current jiffy interval, it may expire way before a jiffy has passed. Make sure MAYDAY_INITIAL_TIMEOUT is at least two to guarantee that at least a full jiffy has passed before calling rescuers. Signed-off-by: Tejun Heo Reported-by: Ray Jui Cc: stable@kernel.org --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 88a3e34f51f6..ee6578b578ad 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -79,7 +79,9 @@ enum { MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ - MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, + /* call for help after 10ms + (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ -- cgit v1.2.1 From 2e725a065b0153f0c449318da1923a120477633d Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Sat, 12 Feb 2011 21:06:51 +0100 Subject: PM / Hibernate: Return error code when alloc_image_page() fails Currently we return 0 in swsusp_alloc() when alloc_image_page() fails. Fix that. 
Also remove unneeded "error" variable since the only useful value of error is -ENOMEM. [rjw: Fixed up the changelog and changed subject.] Signed-off-by: Stanislaw Gruszka Cc: stable@kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0dac75ea4456..64db648ff911 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1519,11 +1519,8 @@ static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { - int error = 0; - if (nr_highmem > 0) { - error = get_highmem_buffer(PG_ANY); - if (error) + if (get_highmem_buffer(PG_ANY)) goto err_out; if (nr_highmem > alloc_highmem) { nr_highmem -= alloc_highmem; @@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, err_out: swsusp_free(); - return error; + return -ENOMEM; } asmlinkage int swsusp_save(void) -- cgit v1.2.1 From e9345aab675382176740bc8a2c6d3caf1510e46d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Feb 2011 08:09:49 +0100 Subject: Revert "tracing: Add unstable sched clock note to the warning" This reverts commit 5e38ca8f3ea423442eaafe1b7e206084aa38120a. Breaks the build of several !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures. Cc: Jiri Olsa Cc: Steven Rostedt Message-ID: <20110217171823.GB17058@elte.hu> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7739893a1d0a..bd1c35a4fbcc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,14 +2163,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + (unsigned long long)cpu_buffer->write_stamp); add_timestamp = 1; } } -- cgit v1.2.1 From db1c1cce4a653dcbe6949c72ae7b9f42cab1b929 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 18 Feb 2011 10:07:25 +0100 Subject: ntp: Remove redundant and incorrect parameter check The ADJ_SETOFFSET code redundantly checks the range of the nanoseconds field of the time value. This field is checked again in the subsequent call to timekeeping_inject_offset(). Also, as is, the check will not detect whether the number of microseconds is out of range. Let timekeeping_inject_offset() do the error checking. 
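For illustration (not part of the patch), a minimal userspace sketch of the ADJ_SETOFFSET path discussed here, assuming a libc that already exposes ADJ_SETOFFSET/ADJ_NANO and a caller with CAP_SYS_TIME; with this change an out-of-range offset is rejected inside timekeeping_inject_offset() and the error is propagated back to the caller:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { 0 };

	/* step the clock forward by 1.5 s; with ADJ_NANO, tv_usec carries nanoseconds */
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;
	tx.time.tv_sec = 1;
	tx.time.tv_usec = 500000000;

	if (adjtimex(&tx) == -1)
		perror("adjtimex");	/* e.g. EINVAL for an out-of-range offset */

	return 0;
}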
Signed-off-by: Richard Cochran Cc: johnstul@us.ibm.com LKML-Reference: <20110218090724.GA2924@riccoc20.at.omicron.at> Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5ac593267a26..5f1bb8e2008f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -650,13 +650,13 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_SETOFFSET) { struct timespec delta; - if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC) - return -EINVAL; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; - timekeeping_inject_offset(&delta); + result = timekeeping_inject_offset(&delta); + if (result) + return result; } getnstimeofday(&ts); -- cgit v1.2.1 From c1ee6264280e740a9d3ff3feef38642cf0a57013 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Feb 2011 17:45:15 +0100 Subject: genirq: Prevent access beyond allocated_irqs bitmap Lars-Peter Clausen pointed out: I stumbled upon this while looking through the existing archs using SPARSE_IRQ. Even with SPARSE_IRQ the NR_IRQS is still the upper limit for the number of IRQs. Both PXA and MMP set NR_IRQS to IRQ_BOARD_START, with IRQ_BOARD_START being the number of IRQs used by the core. In various machine files the nr_irqs field of the ARM machine defintion struct is then set to "IRQ_BOARD_START + NR_BOARD_IRQS". As a result "nr_irqs" will greater then NR_IRQS which then again causes the "allocated_irqs" bitmap in the core irq code to be accessed beyond its size overwriting unrelated data. The core code really misses a sanity check there. This went unnoticed so far as by chance the compiler/linker places data behind that bitmap which gets initialized later on those affected platforms. So the obvious fix would be to add a sanity check in early_irq_init() and break all affected platforms. Though that check wants to be backported to stable as well, which will require to fix all known problematic platforms and probably some more yet not known ones as well. Lots of churn. A way simpler solution is to allocate a slightly larger bitmap and avoid the whole churn w/o breaking anything. Add a few warnings when an arch returns utter crap. 
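For illustration (not from the patch), the overflow scenario described above with made-up numbers for a hypothetical SPARSE_IRQ board:

/* hypothetical values mirroring the PXA/MMP case described above */
#define FOO_NR_IRQS		96	/* NR_IRQS == IRQ_BOARD_START         */
#define FOO_NR_BOARD_IRQS	64	/* extra IRQs added by the board file */

/* machine_desc->nr_irqs = 96 + 64 = 160, but the core bitmap used to be
 * DECLARE_BITMAP(allocated_irqs, 96); allocating descriptor 159 then does
 * set_bit(159, allocated_irqs) and scribbles past the end of the bitmap.
 * With this patch the bitmap is NR_IRQS + 8196 bits and early_irq_init()
 * clamps nr_irqs/initcnt to that size.
 */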
Reported-by: Lars-Peter Clausen Signed-off-by: Thomas Gleixner Cc: stable@kernel.org # .37 Cc: Haojian Zhuang Cc: Eric Miao Cc: Peter Zijlstra --- kernel/irq/internals.h | 6 ++++++ kernel/irq/irqdesc.c | 11 ++++++++++- kernel/irq/resend.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4571ae7e085a..99c3bc8a6fb4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -3,6 +3,12 @@ */ #include +#ifdef CONFIG_SPARSE_IRQ +# define IRQ_BITMAP_BITS (NR_IRQS + 8196) +#else +# define IRQ_BITMAP_BITS NR_IRQS +#endif + extern int noirqdebug; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 282f20230e67..2039bea31bdf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -94,7 +94,7 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); -static DECLARE_BITMAP(allocated_irqs, NR_IRQS); +static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ @@ -217,6 +217,15 @@ int __init early_irq_init(void) initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) + nr_irqs = IRQ_BITMAP_BITS; + + if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) + initcnt = IRQ_BITMAP_BITS; + + if (initcnt > nr_irqs) + nr_irqs = initcnt; + for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node); set_bit(i, allocated_irqs); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 891115a929aa..dc49358b73fa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -23,7 +23,7 @@ #ifdef CONFIG_HARDIRQS_SW_RESEND /* Bitmap to handle software resend of interrupts: */ -static DECLARE_BITMAP(irqs_resend, NR_IRQS); +static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); /* * Run software resends of IRQ's -- cgit v1.2.1 From 6d83f94db95cfe65d2a6359cccdf61cf087c2598 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Feb 2011 23:27:23 +0100 Subject: genirq: Disable the SHIRQ_DEBUG call in request_threaded_irq for now With CONFIG_SHIRQ_DEBUG=y we call a newly installed interrupt handler in request_threaded_irq(). The original implementation (commit a304e1b8) called the handler _BEFORE_ it was installed, but that caused problems with handlers calling disable_irq_nosync(). See commit 377bf1e4. It's braindead in the first place to call disable_irq_nosync in shared handlers, but .... Moving this call after we installed the handler looks innocent, but it is very subtle broken on SMP. Interrupt handlers rely on the fact, that the irq core prevents reentrancy. Now this debug call violates that promise because we run the handler w/o the IRQ_INPROGRESS protection - which we cannot apply here because that would result in a possibly forever masked interrupt line. A concurrent real hardware interrupt on a different CPU results in handler reentrancy and can lead to complete wreckage, which was unfortunately observed in reality and took a fricking long time to debug. Leave the code here for now. We want this debug feature, but that's not easy to fix. We really should get rid of those disable_irq_nosync() abusers and remove that function completely. 
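For context (not from the patch), a hedged sketch of the kind of handler the DEBUG_SHIRQ fake invocation was meant to exercise: with IRQF_SHARED a handler must tolerate being called when its device raised nothing and return IRQ_NONE; all foo_* names and registers are hypothetical:

static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
	struct foo_device *foo = dev_id;			/* hypothetical driver data */
	u32 pending = readl(foo->regs + FOO_INT_STATUS);	/* hypothetical register   */

	if (!pending)
		return IRQ_NONE;	/* not ours - mandatory for IRQF_SHARED */

	writel(pending, foo->regs + FOO_INT_ACK);
	/* ... process the events encoded in 'pending' ... */
	return IRQ_HANDLED;
}

/* registered with: request_irq(irq, foo_interrupt, IRQF_SHARED, "foo", foo); */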
Signed-off-by: Thomas Gleixner Cc: Anton Vorontsov Cc: David Woodhouse Cc: Arjan van de Ven Cc: stable@kernel.org # .28 -> .37 --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..9033c1c70828 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1100,7 +1100,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (retval) kfree(action); -#ifdef CONFIG_DEBUG_SHIRQ +#ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it -- cgit v1.2.1 From e7bcecb7b1d29b9ad5af939149a945658620ca8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Feb 2011 17:12:57 +0100 Subject: genirq: Make nr_irqs runtime expandable We face more and more the requirement to expand nr_irqs at runtime. The reason are irq expanders which can not be detected in the early boot stage. So we speculate nr_irqs to have enough room. Further Xen needs extra irq numbers and we really want to avoid adding more "detection" code into the early boot. There is no real good reason why we need to limit nr_irqs at early boot. Allow the allocation code to expand nr_irqs. We have already 8k extra number space in the allocation bitmap, so lets use it. Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a250d3a0af12..6f6644f819dd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -206,6 +206,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } +static int irq_expand_nr_irqs(unsigned int cnt) +{ + if (nr_irqs + cnt > IRQ_BITMAP_BITS) + return -ENOMEM; + nr_irqs += cnt; + return 0; +} + int __init early_irq_init(void) { int i, initcnt, node = first_online_node; @@ -287,6 +295,12 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { return start; } + +static int irq_expand_nr_irqs(unsigned int cnt) +{ + return -ENOMEM; +} + #endif /* !CONFIG_SPARSE_IRQ */ /* Dynamic interrupt handling */ @@ -335,9 +349,11 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) if (irq >=0 && start != irq) goto err; - ret = -ENOMEM; - if (start >= nr_irqs) - goto err; + if (start >= nr_irqs) { + ret = irq_expand_nr_irqs(cnt); + if (ret) + goto err; + } bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); -- cgit v1.2.1 From 43abe43ce0619d744c7a5bb15cce075e532b53b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 12:10:49 +0100 Subject: genirq: Add missing buslock to set_irq_type(), set_irq_wake() chips behind a slow bus cannot update the chip under desc->lock, but we miss the chip_buslock/chip_bus_sync_unlock() calls around the set type and set wake functions. 
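For illustration (not from the patch), a sketch of the call sites these buslock additions protect, e.g. a driver whose interrupt line comes from an expander behind a slow I2C bus; the foo_* names are hypothetical and the pre-rename set_irq_type()/set_irq_wake() entry points are used as they exist at this point in the series:

	/* probe: the expander's irq_chip callbacks sleep on the I2C bus,
	 * so set_irq_type() must take the chip bus lock around the change */
	err = set_irq_type(foo->irq, IRQ_TYPE_EDGE_FALLING);
	if (err)
		return err;

	err = request_threaded_irq(foo->irq, NULL, foo_thread_fn,
				   IRQF_ONESHOT, "foo", foo);
	if (err)
		return err;

	/* suspend path: allow this line to wake the system */
	err = set_irq_wake(foo->irq, 1);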
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4acad83..9639ab8bece0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -65,9 +65,11 @@ int set_irq_type(unsigned int irq, unsigned int type) if (type == IRQ_TYPE_NONE) return 0; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); ret = __irq_set_trigger(desc, irq, type); raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); return ret; } EXPORT_SYMBOL(set_irq_type); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ba84307fbf24..a400db220cf3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -454,6 +454,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { @@ -476,6 +477,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); return ret; } EXPORT_SYMBOL(set_irq_wake); -- cgit v1.2.1 From a0cd9ca2b907d7ee26575e7b63ac92dad768a75e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 11:36:33 +0100 Subject: genirq: Namespace cleanup The irq namespace has become quite convoluted. My bad. Clean it up and deprecate the old functions. All new functions follow the scheme: irq number based: irq_set/get/xxx/_xxx(unsigned int irq, ...) irq_data based: irq_data_set/get/xxx/_xxx(struct irq_data *d, ....) irq_desc based: irq_desc_get_xxx(struct irq_desc *desc) Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 28 ++++++++++++++-------------- kernel/irq/manage.c | 6 +++--- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9639ab8bece0..622b55ac0e09 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -19,11 +19,11 @@ #include "internals.h" /** - * set_irq_chip - set the irq chip for an irq + * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure */ -int set_irq_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, struct irq_chip *chip) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -43,14 +43,14 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) return 0; } -EXPORT_SYMBOL(set_irq_chip); +EXPORT_SYMBOL(irq_set_chip); /** - * set_irq_type - set the irq trigger type for an irq + * irq_set_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ -int set_irq_type(unsigned int irq, unsigned int type) +int irq_set_irq_type(unsigned int irq, unsigned int type) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -72,16 +72,16 @@ int set_irq_type(unsigned int irq, unsigned int type) chip_bus_sync_unlock(desc); return ret; } -EXPORT_SYMBOL(set_irq_type); +EXPORT_SYMBOL(irq_set_irq_type); /** - * set_irq_data - set irq type data for an irq + * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ -int set_irq_data(unsigned int irq, void *data) +int irq_set_handler_data(unsigned int irq, void *data) { struct irq_desc 
*desc = irq_to_desc(irq); unsigned long flags; @@ -97,16 +97,16 @@ int set_irq_data(unsigned int irq, void *data) raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } -EXPORT_SYMBOL(set_irq_data); +EXPORT_SYMBOL(irq_set_handler_data); /** - * set_irq_msi - set MSI descriptor data for an irq + * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ -int set_irq_msi(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -126,13 +126,13 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) } /** - * set_irq_chip_data - set irq chip data for an irq + * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ -int set_irq_chip_data(unsigned int irq, void *data) +int irq_set_chip_data(unsigned int irq, void *data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -154,7 +154,7 @@ int set_irq_chip_data(unsigned int irq, void *data) return 0; } -EXPORT_SYMBOL(set_irq_chip_data); +EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a400db220cf3..b1b4da9446e6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -434,7 +434,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * set_irq_wake - control irq power management wakeup + * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * @@ -445,7 +445,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". */ -int set_irq_wake(unsigned int irq, unsigned int on) +int irq_set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -480,7 +480,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) chip_bus_sync_unlock(desc); return ret; } -EXPORT_SYMBOL(set_irq_wake); +EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a -- cgit v1.2.1 From 1fa46f1f070961783661ae640cd2f6b2557f3885 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 16:46:58 +0100 Subject: genirq: Simplify affinity related code There is lot of #ifdef CONFIG_GENERIC_PENDING_IRQ along with duplicated code in the irq core. Move the #ifdeffery into one place and cleanup the code so it's readable. No functional change. 
Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 64 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b1b4da9446e6..99f3e9a3780c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -100,47 +100,70 @@ void irq_set_thread_affinity(struct irq_desc *desc) } } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status & IRQ_MOVE_PCNTXT; +} +static inline bool irq_move_pending(struct irq_desc *desc) +{ + return desc->status & IRQ_MOVE_PENDING; +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } +static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } +#endif + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; + int ret = 0; if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); + if (irq_can_move_pcntxt(desc)) { + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + if (!ret) { + cpumask_copy(desc->irq_data.affinity, mask); irq_set_thread_affinity(desc); } - } - else { + } else { desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(desc->pending_mask, cpumask); + irq_copy_pending(desc, mask); } -#else - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); - irq_set_thread_affinity(desc); - } -#endif + if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } desc->status |= IRQ_AFFINITY_SET; raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; + return ret; } int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) @@ -167,18 +190,13 @@ static void irq_affinity_notify(struct work_struct *work) cpumask_var_t cpumask; unsigned long flags; - if (!desc) - goto out; - - if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; raw_spin_lock_irqsave(&desc->lock, flags); -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) - cpumask_copy(cpumask, desc->pending_mask); + if (irq_move_pending(desc)) + irq_get_pending(cpumask, desc); else -#endif cpumask_copy(cpumask, desc->irq_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.1 From b008207cbd0d5ce606a1a2ac52826e0ab37d0b99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 17:30:50 +0100 Subject: genirq: Rremove redundant check IRQ_NO_BALANCING is 
already checked in irq_can_set_affinity() above, no need to check it again. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 99f3e9a3780c..591c927b135c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -256,6 +256,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); */ static int setup_affinity(unsigned int irq, struct irq_desc *desc) { + /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) return 0; @@ -263,7 +264,7 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { + if (desc->status & (IRQ_AFFINITY_SET)) { if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; -- cgit v1.2.1 From 569bda8df11effa03e618729293c7961696abb10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 17:05:08 +0100 Subject: genirq: Always apply cpu online mask If the affinity had been set by the user, then a later request_irq() will honour that setting. But online cpus can have changed. So apply the online mask and for this case as well. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 591c927b135c..ade65bfb466d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -256,6 +256,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); */ static int setup_affinity(unsigned int irq, struct irq_desc *desc) { + struct cpumask *set = irq_default_affinity; + /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) return 0; @@ -265,15 +267,13 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET)) { - if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) - < nr_cpu_ids) - goto set_affinity; + if (cpumask_intersects(desc->irq_data.affinity, + cpu_online_mask)) + set = desc->irq_data.affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - - cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); -set_affinity: + cpumask_and(desc->irq_data.affinity, cpu_online_mask, set); desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); return 0; -- cgit v1.2.1 From 3b8249e759c701c4a82f99d957be651a7657bf6f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 16:02:20 +0100 Subject: genirq: Do not copy affinity before set While rumaging through arch code I found that there are a few workarounds which deal with the fact that the initial affinity setting from request_irq() copies the mask into irq_data->affinity before the chip code is called. In the normal path we unconditionally copy the mask when the chip code returns 0. Copy after the code is called and add a return code IRQ_SET_MASK_OK_NOCOPY for the chip functions, which prevents the copy. That way we see the real mask when the chip function decided to truncate it further as some arches do. IRQ_SET_MASK_OK is 0, which is the current behaviour. 
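For illustration (not from the patch), a hedged sketch of what the new return code permits on the chip side: a hypothetical controller that can only target a single CPU truncates the mask itself, records what it actually programmed, and returns IRQ_SET_MASK_OK_NOCOPY so the core does not overwrite that with the caller's wider mask; foo_route_to_cpu() is an assumed hardware helper:

static int foo_irq_set_affinity(struct irq_data *data,
				const struct cpumask *mask, bool force)
{
	unsigned int cpu = cpumask_first_and(mask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		return -EINVAL;

	foo_route_to_cpu(data->irq, cpu);		/* hypothetical hw helper   */
	cpumask_copy(data->affinity, cpumask_of(cpu));	/* the mask really in effect */

	return IRQ_SET_MASK_OK_NOCOPY;	/* core keeps our truncated mask */
}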
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 47 ++++++++++++++++++++++++++++++++++------------- kernel/irq/proc.c | 2 +- 3 files changed, 36 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99c3bc8a6fb4..b5bfa24aa6a6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -43,7 +43,7 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif -extern int irq_select_affinity_usr(unsigned int irq); +extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ade65bfb466d..dc95d53df510 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -148,9 +148,12 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) if (irq_can_move_pcntxt(desc)) { ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - if (!ret) { + switch (ret) { + case IRQ_SET_MASK_OK: cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); + ret = 0; } } else { desc->status |= IRQ_MOVE_PENDING; @@ -254,9 +257,12 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int setup_affinity(unsigned int irq, struct irq_desc *desc) +static int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { + struct irq_chip *chip = get_irq_desc_chip(desc); struct cpumask *set = irq_default_affinity; + int ret; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -273,13 +279,20 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(desc->irq_data.affinity, cpu_online_mask, set); - desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); + cpumask_and(mask, cpu_online_mask, set); + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } return 0; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *d) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) { return irq_select_affinity(irq); } @@ -288,23 +301,23 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) /* * Called when affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq) +int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc); + ret = setup_affinity(irq, desc, mask); if (!ret) irq_set_thread_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { return 0; } @@ -765,8 +778,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; const char *old_name = NULL; unsigned long flags; - int nested, shared = 0; - int ret; + int ret, nested, shared = 0; + cpumask_var_t mask; if (!desc) return -EINVAL; @@ -831,6 +844,11 @@ 
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread = t; } + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_thread; + } + /* * The following block of code has to be executed atomically */ @@ -876,7 +894,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags & IRQF_TRIGGER_MASK); if (ret) - goto out_thread; + goto out_mask; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -903,7 +921,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_NO_BALANCING; /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc); + setup_affinity(irq, desc, mask); } else if ((new->flags & IRQF_TRIGGER_MASK) && (new->flags & IRQF_TRIGGER_MASK) @@ -956,6 +974,9 @@ mismatch: #endif ret = -EBUSY; +out_mask: + free_cpumask_var(mask); + out_thread: raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9f8a7b..a46bd762db47 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - err = irq_select_affinity_usr(irq) ? -EINVAL : count; + err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; -- cgit v1.2.1 From 2b879eaf095878430c38cbd95e5c0fc4ce65ad8e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 11:25:02 +0100 Subject: genirq: Remove redundant thread affinity setting Thread affinity is already set by setup_affinity(). Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dc95d53df510..33a6ee0ac68f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -309,8 +309,6 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) raw_spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc, mask); - if (!ret) - irq_set_thread_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.1 From 1082687e8d6292a61759eb83358e7db39fed1bf4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:05:05 +0100 Subject: genirq: Plug race in report_bad_irq() We cannot walk the action chain unlocked. Even if IRQ_INPROGRESS is set an action can be removed and we follow a null pointer. It's safe to take the lock there, because the code which removes the action will call synchronize_irq() which waits unlocked for IRQ_INPROGRESS going away. 
Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9d5f3..2fbfda2716e1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -139,15 +139,13 @@ static void poll_spurious_irqs(unsigned long dummy) * * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) - * - * Called under desc->lock */ - static void __report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { struct irqaction *action; + unsigned long flags; if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { printk(KERN_ERR "irq event %d: bogus return value %x\n", @@ -159,6 +157,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, dump_stack(); printk(KERN_ERR "handlers:\n"); + /* + * We need to take desc->lock here. note_interrupt() is called + * w/o desc->lock held, but IRQ_PROGRESS set. We might race + * with something else removing an action. It's ok to take + * desc->lock here. See synchronize_irq(). + */ + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -167,6 +172,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, printk("\n"); action = action->next; } + raw_spin_unlock_irqrestore(&desc->lock, flags); } static void -- cgit v1.2.1 From b738a50a202639614c98b5763b01bf9201779e50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 23:58:19 +0100 Subject: genirq: Warn when handler enables interrupts We run all handlers with interrupts disabled and expect them not to enable them. Warn when we catch one who does. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a7190122..cdd6fbbe771c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -68,6 +68,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) ret = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, ret); + if (WARN_ON_ONCE(!irqs_disabled())) + local_irq_disable(); + switch (ret) { case IRQ_WAKE_THREAD: /* @@ -114,7 +117,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - local_irq_disable(); return retval; } -- cgit v1.2.1 From fa27271bc8d230355c1f24ddea103824fdc12de6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:10:39 +0100 Subject: genirq: Fixup poll handling try_one_irq() contains redundant code and lots of useless checks for shared interrupts. Check for shared before setting IRQ_INPROGRESS and then call handle_IRQ_event() while pending. Shorter version with the same functionality. 
Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 50 +++++++++++++++++++------------------------------- 1 file changed, 19 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2fbfda2716e1..0af9e59c82eb 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -42,48 +42,36 @@ static int try_one_irq(int irq, struct irq_desc *desc) raw_spin_unlock(&desc->lock); return ok; } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; - raw_spin_unlock(&desc->lock); - - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(irq, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; - } - local_irq_disable(); - /* Now clean up the flags */ - raw_spin_lock(&desc->lock); - action = desc->action; - /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. */ - while ((desc->status & IRQ_PENDING) && action) { - /* - * Perform real IRQ processing for the IRQ we deferred - */ - work = 1; + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || !action->next) + goto out; + + /* Honour the normal IRQ locking */ + desc->status |= IRQ_INPROGRESS; + do { + work++; + desc->status &= ~IRQ_PENDING; raw_spin_unlock(&desc->lock); - handle_IRQ_event(irq, action); + if (handle_IRQ_event(irq, action) != IRQ_NONE) + ok = 1; raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; - } + action = desc->action; + } while ((desc->status & IRQ_PENDING) && action); + desc->status &= ~IRQ_INPROGRESS; /* * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work) + if (work > 1) irq_end(irq, desc); - raw_spin_unlock(&desc->lock); +out: + raw_spin_unlock(&desc->lock); return ok; } -- cgit v1.2.1 From c7259cd7af757ddcd65701c37099dcddae2054f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:52:27 +0100 Subject: genirq: Do not poll disabled, percpu and timer interrupts There is no point in polling disabled lines. percpu does not make sense at all because we only poll on the cpu we're currently running on. Also polling per_cpu interrupts is racy as hell. The handler runs without locking so we might get a huge surprise. If the timer interrupt needs polling, then we wont get there anyway. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 0af9e59c82eb..bd0e42d3e0ba 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -25,30 +25,42 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(int irq, struct irq_desc *desc) +static int try_one_irq(int irq, struct irq_desc *desc, bool force) { struct irqaction *action; int ok = 0, work = 0; raw_spin_lock(&desc->lock); + + /* PER_CPU and nested thread interrupts are never polled */ + if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD)) + goto out; + + /* + * Do not poll disabled interrupts unless the spurious + * disabled poller asks explicitely. 
+ */ + if ((desc->status & IRQ_DISABLED) && !force) + goto out; + + /* + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. + */ + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || + (action->flags & __IRQF_TIMER) || !action->next) + goto out; + /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - raw_spin_unlock(&desc->lock); - return ok; - } - /* - * All handlers must agree on IRQF_SHARED, so we test just the - * first. Check for action->next as well. - */ - action = desc->action; - if (!action || !(action->flags & IRQF_SHARED) || !action->next) + desc->status |= IRQ_PENDING; goto out; + } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; @@ -87,7 +99,7 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc)) + if (try_one_irq(i, desc, false)) ok = 1; } /* So the caller can adjust the irq error counts */ @@ -112,7 +124,7 @@ static void poll_spurious_irqs(unsigned long dummy) continue; local_irq_disable(); - try_one_irq(i, desc); + try_one_irq(i, desc, true); local_irq_enable(); } -- cgit v1.2.1 From d05c65fff0ef672be75429266751f0e015b54d94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 14:31:37 +0100 Subject: genirq: spurious: Run only one poller at a time No point in running concurrent pollers which confuse each other by setting PENDING. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bd0e42d3e0ba..56ff8fffb8b0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -21,6 +21,8 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static int irq_poll_cpu; +static atomic_t irq_poll_active; /* * Recovery handler for misrouted interrupts. 
@@ -92,6 +94,11 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; + if (atomic_inc_return(&irq_poll_active) == 1) + goto out; + + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { if (!i) continue; @@ -102,6 +109,8 @@ static int misrouted_irq(int irq) if (try_one_irq(i, desc, false)) ok = 1; } +out: + atomic_dec(&irq_poll_active); /* So the caller can adjust the irq error counts */ return ok; } @@ -111,6 +120,10 @@ static void poll_spurious_irqs(unsigned long dummy) struct irq_desc *desc; int i; + if (atomic_inc_return(&irq_poll_active) != 1) + goto out; + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { unsigned int status; @@ -127,7 +140,8 @@ static void poll_spurious_irqs(unsigned long dummy) try_one_irq(i, desc, true); local_irq_enable(); } - +out: + atomic_dec(&irq_poll_active); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } -- cgit v1.2.1 From fe200ae48ef5c79bf7941fe8046ff9505c570ff6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 10:34:30 +0100 Subject: genirq: Mark polled irqs and defer the real handler With the chip.end() function gone we might run into a situation where a poll call runs and the real interrupt comes in, sees IRQ_INPROGRESS and disables the line. That might be a perfect working one, which will then be masked forever. So mark them polled while the poll runs. When the real handler sees IRQ_INPROGRESS it checks the poll flag and waits for the polling to complete. Add the necessary amount of sanity checks to it to avoid deadlocks. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 26 +++++++++++++++++++------ kernel/irq/internals.h | 11 +---------- kernel/irq/spurious.c | 51 ++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 60 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 622b55ac0e09..31258782742c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -448,6 +448,13 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_nested_irq); +static bool irq_check_poll(struct irq_desc *desc) +{ + if (!(desc->status & IRQ_POLL_INPROGRESS)) + return false; + return irq_wait_for_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. 
* @irq: the interrupt number @@ -469,7 +476,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; + if (!irq_check_poll(desc)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -510,7 +519,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; + if (!irq_check_poll(desc)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -558,7 +569,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; + if (!irq_check_poll(desc)) + goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -620,9 +632,11 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) */ if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc); - goto out_unlock; + if (!irq_check_poll(desc)) { + desc->status |= (IRQ_PENDING | IRQ_MASKED); + mask_ack_irq(desc); + goto out_unlock; + } } kstat_incr_irqs_this_cpu(irq, desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b5bfa24aa6a6..0eff7e92b1a9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -28,6 +28,7 @@ extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); +bool irq_wait_for_poll(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -47,16 +48,6 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static inline void irq_end(unsigned int irq, struct irq_desc *desc) -{ - if (desc->irq_data.chip && desc->irq_data.chip->end) - desc->irq_data.chip->end(irq); -} -#else -static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } -#endif - /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 56ff8fffb8b0..f749d29bfd81 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -24,13 +24,45 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); static int irq_poll_cpu; static atomic_t irq_poll_active; +/* + * We wait here for a poller to finish. + * + * If the poll runs on this CPU, then we yell loudly and return + * false. That will leave the interrupt line disabled in the worst + * case, but it should never happen. + * + * We wait until the poller is done and then recheck disabled and + * action (about to be disabled). Only if it's still active, we return + * true and let the handler run. 
+ */ +bool irq_wait_for_poll(struct irq_desc *desc) +{ + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + +#ifdef CONFIG_SMP + do { + raw_spin_unlock(&desc->lock); + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (desc->status & IRQ_INPROGRESS); + /* Might have been disabled in meantime */ + return !(desc->status & IRQ_DISABLED) && desc->action; +#else + return false; +#endif +} + /* * Recovery handler for misrouted interrupts. */ static int try_one_irq(int irq, struct irq_desc *desc, bool force) { struct irqaction *action; - int ok = 0, work = 0; + int ok = 0; raw_spin_lock(&desc->lock); @@ -64,10 +96,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; + /* Honour the normal IRQ locking and mark it poll in progress */ + desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS; do { - work++; desc->status &= ~IRQ_PENDING; raw_spin_unlock(&desc->lock); if (handle_IRQ_event(irq, action) != IRQ_NONE) @@ -76,14 +107,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) action = desc->action; } while ((desc->status & IRQ_PENDING) && action); - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too - */ - if (work > 1) - irq_end(irq, desc); - + desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS); out: raw_spin_unlock(&desc->lock); return ok; @@ -238,6 +262,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { + if (desc->status & IRQ_POLL_INPROGRESS) + return; + if (unlikely(action_ret != IRQ_HANDLED)) { /* * If we are seeing only the odd spurious IRQ caused by -- cgit v1.2.1 From 1535dfacbf21c4da1b73fcf07c39913da5bd5581 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:55:43 +0100 Subject: genirq: Move irq thread flags to core Soleley used in core code. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 0eff7e92b1a9..b17c98440400 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -11,6 +11,20 @@ extern int noirqdebug; +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, + IRQTF_WARNED, + IRQTF_AFFINITY, +}; + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ -- cgit v1.2.1 From 3b56f0585fd4c02d047dc406668cb40159b2d340 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:12 +0000 Subject: genirq: Remove bogus conditional The if (chip->irq_shutdown) check will always evaluate to true, as we fill in chip->irq_shutdown with default_shutdown in irq_chip_set_defaults() if the chip does not provide its own function. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.667607458@linutronix.de> --- kernel/irq/manage.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 33a6ee0ac68f..30bc8de40905 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1057,10 +1057,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else - desc->irq_data.chip->irq_disable(&desc->irq_data); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } #ifdef CONFIG_SMP -- cgit v1.2.1 From 4699923861513671d3f6ade8efb4e56a9a7ecadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:14 +0000 Subject: genirq: Consolidate startup/shutdown of interrupts Aside of duplicated code some of the startup/shutdown sites do not handle the MASKED/DISABLED flags and the depth field at all. Move that to a helper function and take care of it there. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.787481468@linutronix.de> --- kernel/irq/autoprobe.c | 10 +++++----- kernel/irq/chip.c | 37 ++++++++++++++++++++----------------- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 14 +++++--------- 4 files changed, 33 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f86c36..08947cb61725 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -60,7 +60,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -77,7 +77,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->irq_data.chip->irq_startup(&desc->irq_data)) + if (irq_startup(desc)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); @@ -99,7 +99,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } else if (i < 32) mask |= 1 << i; @@ -138,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -182,7 +182,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 31258782742c..988fe7a24282 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -192,6 +192,25 @@ void set_irq_nested_thread(unsigned int irq, int nest) } EXPORT_SYMBOL_GPL(set_irq_nested_thread); +int irq_startup(struct irq_desc *desc) +{ + desc->status &= ~(IRQ_MASKED | IRQ_DISABLED); + desc->depth = 0; + + if (desc->irq_data.chip->irq_startup) + return desc->irq_data.chip->irq_startup(&desc->irq_data); + + desc->irq_data.chip->irq_enable(&desc->irq_data); + return 0; +} + +void irq_shutdown(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED | IRQ_DISABLED; + desc->depth = 1; + desc->irq_data.chip->irq_shutdown(&desc->irq_data); +} + /* * default enable function */ @@ -210,17 +229,6 @@ static void default_disable(struct irq_data *data) { } -/* - * default startup function - */ -static unsigned int default_startup(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_enable(data); - return 0; -} - /* * default shutdown function */ @@ -229,7 +237,6 @@ static void default_shutdown(struct irq_data *data) struct irq_desc *desc = irq_data_to_desc(data); desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -337,8 +344,6 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_enable = default_enable; if (!chip->irq_disable) chip->irq_disable = default_disable; - if (!chip->irq_startup) - chip->irq_startup = default_startup; /* * We use chip->irq_disable, when the user provided its own. 
When * we have default_disable set for chip->irq_disable, then we need @@ -747,10 +752,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; - desc->depth = 0; - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b17c98440400..5cbfc93ed7b1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -38,6 +38,9 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern int irq_startup(struct irq_desc *desc); +extern void irq_shutdown(struct irq_desc *desc); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); /* Resending of interrupts :*/ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 30bc8de40905..9c562477e28b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -906,11 +906,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) { - desc->depth = 0; - desc->status &= ~IRQ_DISABLED; - desc->irq_data.chip->irq_startup(&desc->irq_data); - } else + if (!(desc->status & IRQ_NOAUTOEN)) + irq_startup(desc); + else /* Undo nested disables: */ desc->depth = 1; @@ -1055,10 +1053,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) { - desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - } + if (!desc->action) + irq_shutdown(desc); #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ -- cgit v1.2.1 From 87923470c712dff00b101ffb6b6fbc27bd7a6df5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Feb 2011 12:27:44 +0100 Subject: genirq: Consolidate disable/enable Create irq_disable/enable and use them to keep the flags consistent. 
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 12 +++++++++++- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 2 +- kernel/irq/resend.c | 10 +++++----- kernel/irq/spurious.c | 2 +- 5 files changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 988fe7a24282..86c8e42f7fe4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -200,7 +200,7 @@ int irq_startup(struct irq_desc *desc) if (desc->irq_data.chip->irq_startup) return desc->irq_data.chip->irq_startup(&desc->irq_data); - desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_enable(desc); return 0; } @@ -211,6 +211,16 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_shutdown(&desc->irq_data); } +void irq_enable(struct irq_desc *desc) +{ + desc->irq_data.chip->irq_enable(&desc->irq_data); +} + +void irq_disable(struct irq_desc *desc) +{ + desc->irq_data.chip->irq_disable(&desc->irq_data); +} + /* * default enable function */ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5cbfc93ed7b1..c71fc4de0371 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -40,6 +40,8 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); +extern void irq_enable(struct irq_desc *desc); +extern void irq_disable(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9c562477e28b..007802399697 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -331,7 +331,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); } } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dc49358b73fa..4bfe268dffe5 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,20 +55,20 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - unsigned int status = desc->status; - /* * Make sure the interrupt is enabled, before resending it: */ - desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_enable(desc); /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still * active. 
*/ - if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; + if (desc->status & IRQ_LEVEL) + return; + if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f749d29bfd81..c300b8f6008d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -303,7 +303,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); -- cgit v1.2.1 From 50f7c0327513d5acefbe26fd33498af18d1ffac5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Feb 2011 13:23:54 +0100 Subject: genirq: Remove default magic Now that everything uses the wrappers, we can remove the default functions. None of those functions is performance critical. That makes the IRQ_MASKED flag tracking fully consistent. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 73 +++++++++++------------------------------------------ kernel/irq/manage.c | 4 +-- 2 files changed, 17 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 86c8e42f7fe4..1a239a83f925 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -208,45 +208,29 @@ void irq_shutdown(struct irq_desc *desc) { desc->status |= IRQ_MASKED | IRQ_DISABLED; desc->depth = 1; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); } void irq_enable(struct irq_desc *desc) { - desc->irq_data.chip->irq_enable(&desc->irq_data); -} - -void irq_disable(struct irq_desc *desc) -{ - desc->irq_data.chip->irq_disable(&desc->irq_data); -} - -/* - * default enable function - */ -static void default_enable(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_unmask(&desc->irq_data); + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } -/* - * default disable function - */ -static void default_disable(struct irq_data *data) -{ -} - -/* - * default shutdown function - */ -static void default_shutdown(struct irq_data *data) +void irq_disable(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_mask(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + desc->status |= IRQ_MASKED; + } } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -334,10 +318,6 @@ static void compat_bus_sync_unlock(struct irq_data *data) void irq_chip_set_defaults(struct irq_chip *chip) { #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - /* - * Compat fixup functions need to be before we set the - * defaults for enable/disable/startup/shutdown - */ if (chip->enable) chip->irq_enable = compat_irq_enable; if (chip->disable) @@ -346,31 +326,8 @@ void 
irq_chip_set_defaults(struct irq_chip *chip) chip->irq_shutdown = compat_irq_shutdown; if (chip->startup) chip->irq_startup = compat_irq_startup; -#endif - /* - * The real defaults - */ - if (!chip->irq_enable) - chip->irq_enable = default_enable; - if (!chip->irq_disable) - chip->irq_disable = default_disable; - /* - * We use chip->irq_disable, when the user provided its own. When - * we have default_disable set for chip->irq_disable, then we need - * to use default_shutdown, otherwise the irq line is not - * disabled on free_irq(): - */ - if (!chip->irq_shutdown) - chip->irq_shutdown = chip->irq_disable != default_disable ? - chip->irq_disable : default_shutdown; - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; - - /* - * Now fix up the remaining compat handlers - */ if (chip->bus_lock) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 007802399697..2a6c6ee11a3f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -440,8 +440,8 @@ void enable_irq(unsigned int irq) if (!desc) return; - if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + if (WARN(!desc->irq_data.chip, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) return; chip_bus_lock(desc); -- cgit v1.2.1 From 3aae994fb0f43f6d94a31c33536a83869504abdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 10:17:52 +0100 Subject: genirq: Consolidate IRQ_DISABLED Handle IRQ_DISABLED consistent. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 14 ++++++++++---- kernel/irq/manage.c | 9 +++------ kernel/irq/resend.c | 5 ----- kernel/irq/spurious.c | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1a239a83f925..43c62ca68c11 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -194,11 +194,14 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); int irq_startup(struct irq_desc *desc) { - desc->status &= ~(IRQ_MASKED | IRQ_DISABLED); + desc->status &= ~IRQ_DISABLED; desc->depth = 0; - if (desc->irq_data.chip->irq_startup) - return desc->irq_data.chip->irq_startup(&desc->irq_data); + if (desc->irq_data.chip->irq_startup) { + int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + desc->status &= ~IRQ_MASKED; + return ret; + } irq_enable(desc); return 0; @@ -206,7 +209,7 @@ int irq_startup(struct irq_desc *desc) void irq_shutdown(struct irq_desc *desc) { - desc->status |= IRQ_MASKED | IRQ_DISABLED; + desc->status |= IRQ_DISABLED; desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) desc->irq_data.chip->irq_shutdown(&desc->irq_data); @@ -214,10 +217,12 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); + desc->status |= IRQ_MASKED; } void irq_enable(struct irq_desc *desc) { + desc->status &= ~IRQ_DISABLED; if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else @@ -227,6 +232,7 @@ void irq_enable(struct irq_desc *desc) void irq_disable(struct irq_desc *desc) { + desc->status |= IRQ_DISABLED; if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); desc->status |= IRQ_MASKED; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2a6c6ee11a3f..78a566a9b39f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -329,10 +329,8 @@ void 
__disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) desc->status |= IRQ_SUSPENDED; } - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; + if (!desc->depth++) irq_disable(desc); - } } /** @@ -407,12 +405,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - if (desc->status & IRQ_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; + desc->status |= IRQ_NOPROBE; + irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 4bfe268dffe5..60b20261041b 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,11 +55,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - /* - * Make sure the interrupt is enabled, before resending it: - */ - irq_enable(desc); - /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c300b8f6008d..89e5e16aca39 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -301,7 +301,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; + desc->status |= IRQ_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); -- cgit v1.2.1 From d78f8dd36b90626106ce19cb2e6828b0dc39447e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:17 +0000 Subject: genirq: Do not fiddle with IRQ_MASKED in handle_edge_irq() IRQ_MASKED is set in mask_ack_irq() anyway. Remove it from handle_edge_irq() to allow simpler ab^HHreuse of that function. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.918484270@linutronix.de> --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 43c62ca68c11..2c30b7844595 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -611,7 +611,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { if (!irq_check_poll(desc)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); + desc->status |= IRQ_PENDING; mask_ack_irq(desc); goto out_unlock; } -- cgit v1.2.1 From 4912609f228da4a3d2bfbdf0f31de3d9eab2b7f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:08:49 +0100 Subject: genirq: Implement handle_irq_event() Core code replacement for the ugly camel case. It contains all the code which is shared in all handlers. 
clear status flags set INPROGRESS flag unlock call action chain note_interrupt lock clr INPROGRESS flag Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 47 +++++++++++++++++++++++++++++++++++++++-------- kernel/irq/internals.h | 3 +++ 2 files changed, 42 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index cdd6fbbe771c..4ef059478ebf 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,14 +51,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; @@ -120,3 +113,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } + +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) +{ + irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action); + + if (!noirqdebug) + note_interrupt(desc->irq_data.irq, desc, ret); + return ret; +} + +irqreturn_t handle_irq_event(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + irqreturn_t ret; + + desc->status &= ~IRQ_PENDING; + desc->status |= IRQ_INPROGRESS; + raw_spin_unlock(&desc->lock); + + ret = handle_irq_event_percpu(desc, action); + + raw_spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; + return ret; +} + +/** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number + * @action: the interrupt action chain for this irq + * + * Handles the action chain of an irq event + */ +irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +{ + return __handle_irq_event(irq, action); +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c71fc4de0371..b61824cdadc6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -45,6 +45,9 @@ extern void irq_disable(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event(struct irq_desc *desc); + /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); -- cgit v1.2.1 From 107781e72192067b95a7d373bfa460434a13c6ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:21:02 +0100 Subject: genirq: Use handle_irq_event() in handle_simple_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2c30b7844595..809a03fe7e07 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -448,9 +448,6 @@ static bool irq_check_poll(struct irq_desc *desc) void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -460,19 +457,11 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - 
action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); + handle_irq_event(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } -- cgit v1.2.1 From 1529866c63d789925de9b4250646d82d033e4b95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:22:17 +0100 Subject: genirq: Use handle_irq_event() in handle_level_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 809a03fe7e07..2d2ba4ace0ec 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -479,9 +479,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -496,19 +493,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + handle_irq_event(desc); if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) unmask_irq(desc); -- cgit v1.2.1 From a7ae4de5c8ae8110556f0f9c7241093ef984605c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:23:07 +0100 Subject: genirq: Use handle_irq_event() in handle_fasteoi_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2d2ba4ace0ec..a499ca5b11aa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -518,9 +518,6 @@ EXPORT_SYMBOL_GPL(handle_level_irq); void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -534,26 +531,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; mask_irq(desc); goto out; } - - desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + handle_irq_event(desc); out: desc->irq_data.chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } -- cgit v1.2.1 From a60a5dc2db3b08b3c2900614c43b1262410c2d8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:24:07 +0100 Subject: genirq: Use handle_irq_event() in handle_edge_irq() It's safe to 
drop the IRQ_INPROGRESS flag between action chain walks as we are protected by desc->lock. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a499ca5b11aa..3ccff4d55b39 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -583,14 +583,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - /* Mark the IRQ currently in progress.*/ - desc->status |= IRQ_INPROGRESS; - do { - struct irqaction *action = desc->action; - irqreturn_t action_ret; - - if (unlikely(!action)) { + if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } @@ -606,16 +600,10 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) unmask_irq(desc); } - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - raw_spin_lock(&desc->lock); + handle_irq_event(desc); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } -- cgit v1.2.1 From 849f061c25f8951d11c7dd88f44950ccde296392 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:25:41 +0100 Subject: genirq: Use handle_perpcu_event() in handle_percpu_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ccff4d55b39..52b10ad7bd59 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -618,19 +618,17 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - irqreturn_t action_ret; + struct irq_chip *chip = get_irq_desc_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); - if (desc->irq_data.chip->irq_ack) - desc->irq_data.chip->irq_ack(&desc->irq_data); + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event_percpu(desc, desc->action); - if (desc->irq_data.chip->irq_eoi) - desc->irq_data.chip->irq_eoi(&desc->irq_data); + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); } void -- cgit v1.2.1 From 0877d66257082ce86fca8f9826b91870575b272c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:29:15 +0100 Subject: genirq: Use handle_irq_event() in the spurious poll code Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89e5e16aca39..bc0620745d5f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -56,13 +56,14 @@ bool irq_wait_for_poll(struct irq_desc *desc) #endif } + /* * Recovery handler for misrouted interrupts. 
*/ static int try_one_irq(int irq, struct irq_desc *desc, bool force) { + irqreturn_t ret = IRQ_NONE; struct irqaction *action; - int ok = 0; raw_spin_lock(&desc->lock); @@ -96,21 +97,17 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; } - /* Honour the normal IRQ locking and mark it poll in progress */ - desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS; + /* Mark it poll in progress */ + desc->status |= IRQ_POLL_INPROGRESS; do { - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - if (handle_IRQ_event(irq, action) != IRQ_NONE) - ok = 1; - raw_spin_lock(&desc->lock); + if (handle_irq_event(desc) == IRQ_HANDLED) + ret = IRQ_HANDLED; action = desc->action; - } while ((desc->status & IRQ_PENDING) && action); - - desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS); + } while ((desc->status & IRQ_PENDING) && action); + desc->status &= ~IRQ_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); - return ok; + return ret == IRQ_HANDLED; } static int misrouted_irq(int irq) -- cgit v1.2.1 From 1277a5325adfc53caac7dd3dac5d3d2fd2a125b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:40:27 +0100 Subject: genirq: Simplify handle_irq_event() Now that all core users are converted one layer can go. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 4ef059478ebf..ff40e0f5e2e2 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,10 +51,11 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action) +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0; + unsigned int status = 0, irq = desc->irq_data.irq; do { trace_irq_handler_entry(irq, action); @@ -111,17 +112,9 @@ static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - return retval; -} - -irqreturn_t -handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) -{ - irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action); - if (!noirqdebug) - note_interrupt(desc->irq_data.irq, desc, ret); - return ret; + note_interrupt(irq, desc, ret); + return retval; } irqreturn_t handle_irq_event(struct irq_desc *desc) @@ -149,5 +142,5 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) */ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) { - return __handle_irq_event(irq, action); + return handle_irq_event_percpu(irq_to_desc(irq), action); } -- cgit v1.2.1 From c78b9b65faa291def628dbd8539649f58299f0f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Dec 2010 17:21:47 +0100 Subject: genirq: Implement generic irq_show_interrupts() All archs implement show_interrupts() in more or less the same way. That's tons of duplicated code with different bugs with no value. Implement a generic version and deprecate show_interrupts() Unfortunately we need some ifdeffery for !GENERIC_HARDIRQ archs. 
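The generic /proc/interrupts printer below calls arch_show_interrupts(), declared __weak so that an architecture selecting GENERIC_IRQ_SHOW can override it for its extra rows (per-arch counters such as NMIs). A self-contained demo of the weak-symbol mechanism itself, compilable with gcc or clang in userspace; all names are illustrative:

	#include <stdio.h>

	/* Weak default: used only if no strong definition exists elsewhere. */
	__attribute__((weak)) int arch_show_extra(int prec)
	{
		return 0;	/* nothing arch-specific to print */
	}

	/* An "architecture" overrides it simply by providing a strong
	 * definition of the same symbol in another translation unit, e.g.:
	 *
	 *	int arch_show_extra(int prec)
	 *	{
	 *		printf("%*s:  42\n", prec, "NMI");
	 *		return 0;
	 *	}
	 */

	int main(void)
	{
		printf("arch hook returned %d\n", arch_show_extra(3));
		return 0;
	}
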
Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 +++ kernel/irq/proc.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8e42fec7686d..4cd5d7135e0f 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -20,6 +20,9 @@ config HAVE_SPARSE_IRQ config GENERIC_IRQ_PROBE def_bool n +config GENERIC_IRQ_SHOW + def_bool n + config GENERIC_PENDING_IRQ def_bool n diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a46bd762db47..26449239bb46 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internals.h" @@ -357,3 +358,65 @@ void init_irq_proc(void) } } +#ifdef CONFIG_GENERIC_IRQ_SHOW + +int __weak arch_show_interrupts(struct seq_file *p, int prec) +{ + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + static int prec; + + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return arch_show_interrupts(p, prec); + + /* print header and calculate the width of the first column */ + if (i == 0) { + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + raw_spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %8s", desc->irq_data.chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +#endif -- cgit v1.2.1 From 35e857cbeb24e75c6f9a9312ac30454eee8c5950 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 12:20:23 +0100 Subject: genirq: Fixup core code namespace fallout Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 6 +++--- kernel/irq/manage.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 52b10ad7bd59..143eb2a9fa4e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -618,7 +618,7 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - struct irq_chip *chip = get_irq_desc_chip(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); @@ -685,7 +685,7 @@ void set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle) { - set_irq_chip(irq, chip); + irq_set_chip(irq, chip); __set_irq_handler(irq, handle, 0, NULL); } @@ -693,7 +693,7 @@ void set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { - set_irq_chip(irq, chip); + irq_set_chip(irq, chip); __set_irq_handler(irq, handle, 0, name); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 78a566a9b39f..dd4e5c21b9e7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned 
int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = get_irq_desc_chip(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; int ret; -- cgit v1.2.1 From dbec07bac614a61e3392c1e7c08cc6a49ad43f7a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:19:55 +0100 Subject: genirq: Add internal state field to irq_desc That field will contain internal state information which is not going to be exposed to anything outside the core code - except via accessor functions. I'm tired of everyone fiddling in irq_desc.status. core_internal_state__do_not_mess_with_it is clear enough, annoying to type and easy to grep for. Offenders will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b61824cdadc6..ae96e688f4e1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,5 +1,9 @@ /* * IRQ subsystem internal functions and variables: + * + * Do not ever include this file from anything else than + * kernel/irq/. Do not even think about using any information outside + * of this file for your non core code. */ #include @@ -9,6 +13,8 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#define istate core_internal_state__do_not_mess_with_it + extern int noirqdebug; /* -- cgit v1.2.1 From e6bea9c404699223322d7411c6f2ceaec02fa83c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 13:16:52 +0100 Subject: genirq: Protect tglx from tripping over his own feet The irq_desc.status field will either go away or renamed to settings. Anyway we need to maintain compatibility to avoid breaking the world and some more. While moving bits into the core, I need to avoid that I use any of the still existing IRQ_ bits in the core code by typos. So that file will hold the inline wrappers and some nasty CPP tricks to break the build when typoed. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/irqdesc.c | 4 ++-- kernel/irq/settings.h | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 kernel/irq/settings.h (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ae96e688f4e1..8f200310a952 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,8 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#include "settings.h" + #define istate core_internal_state__do_not_mess_with_it extern int noirqdebug; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6f6644f819dd..8b87f2ce0203 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = IRQ_DEFAULT_INIT_FLAGS; + desc->status = _IRQ_DEFAULT_INIT_FLAGS; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -246,7 +246,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS-1] = { - .status = IRQ_DEFAULT_INIT_FLAGS, + .status = _IRQ_DEFAULT_INIT_FLAGS, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 000000000000..610f55597ce7 --- /dev/null +++ b/kernel/irq/settings.h @@ -0,0 +1,7 @@ +/* + * Internal header to deal with irq_desc->status which will be renamed + * to irq_desc->settings. + */ +enum { + _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, +}; -- cgit v1.2.1 From bd062e7667ac173afef57fbfe9327f3b914a9d4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:25:25 +0100 Subject: genirq: Move IRQ_AUTODETECT to internal state No users outside of core Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 29 ++++++++++++----------------- kernel/irq/internals.h | 15 +++++++++++++-- kernel/irq/manage.c | 3 ++- 3 files changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 08947cb61725..916e56e10a2e 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) { struct irq_desc *desc; unsigned long mask = 0; - unsigned int status; int i; /* @@ -76,7 +75,8 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; + desc->istate |= IRQS_AUTODETECT; + desc->status |= IRQ_WAITING; if (irq_startup(desc)) desc->status |= IRQ_PENDING; } @@ -93,12 +93,11 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { + if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. 
*/ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; + if (!(desc->status & IRQ_WAITING)) { + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } else if (i < 32) @@ -125,19 +124,17 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int status, mask = 0; + unsigned int mask = 0; struct irq_desc *desc; int i; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) + if (desc->istate & IRQS_AUTODETECT) { + if (i < 16 && !(desc->status & IRQ_WAITING)) mask |= 1 << i; - desc->status = status & ~IRQ_AUTODETECT; + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); @@ -169,19 +166,17 @@ int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_of_irqs = 0; struct irq_desc *desc; - unsigned int status; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { + if (desc->istate & IRQS_AUTODETECT) { + if (!(desc->status & IRQ_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; } - desc->status = status & ~IRQ_AUTODETECT; + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8f200310a952..7ffd4f439b92 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -33,6 +33,15 @@ enum { IRQTF_AFFINITY, }; +/* + * Bit masks for desc->state + * + * IRQS_AUTODETECT - autodetection in progress + */ +enum { + IRQS_AUTODETECT = 0x00000001, +}; + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ @@ -98,6 +107,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #include #define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -117,7 +127,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_REPLAY); - P(IRQ_AUTODETECT); P(IRQ_WAITING); P(IRQ_LEVEL); P(IRQ_MASKED); @@ -127,7 +136,9 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOPROBE); P(IRQ_NOREQUEST); P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); } #undef P - +#undef PS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dd4e5c21b9e7..abe852c9449d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + desc->istate &= ~IRQS_AUTODETECT; if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; -- cgit v1.2.1 From 7acdd53e5b2c55b6f7e3427e85e2f91fa814a4f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:40:54 +0100 Subject: genirq: Move IRQ_SPURIOUS_DISABLED to core state No users outside. 
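For context, IRQS_SPURIOUS_DISABLED is the bit note_interrupt() sets when a line keeps firing with no handler claiming it, and __setup_irq() clears it to give the line another chance once a new handler arrives. The detector can only work if shared handlers report honestly whether the interrupt was theirs; a sketch of a well-behaved shared handler, with the device helpers marked as hypothetical:

	static irqreturn_t demo_shared_handler(int irq, void *dev_id)
	{
		struct demo_device *dev = dev_id;

		if (!demo_device_raised_irq(dev))	/* hypothetical status check */
			return IRQ_NONE;		/* not ours, let others look */

		demo_device_handle(dev);		/* hypothetical device work */
		return IRQ_HANDLED;
	}
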
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 9 ++++----- kernel/irq/spurious.c | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7ffd4f439b92..dc5e21b84f9e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -37,9 +37,12 @@ enum { * Bit masks for desc->state * * IRQS_AUTODETECT - autodetection in progress + * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt + * detection */ enum { IRQS_AUTODETECT = 0x00000001, + IRQS_SPURIOUS_DISABLED = 0x00000002, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index abe852c9449d..5b918ffa46af 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,9 +897,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | - IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); - desc->istate &= ~IRQS_AUTODETECT; + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED); if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; @@ -937,8 +936,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ - if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { - desc->status &= ~IRQ_SPURIOUS_DISABLED; + if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { + desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc, irq, false); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bc0620745d5f..2941d8a22df7 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -146,15 +146,15 @@ static void poll_spurious_irqs(unsigned long dummy) irq_poll_cpu = smp_processor_id(); for_each_irq_desc(i, desc) { - unsigned int status; + unsigned int state; if (!i) continue; /* Racy but it doesn't matter */ - status = desc->status; + state = desc->istate; barrier(); - if (!(status & IRQ_SPURIOUS_DISABLED)) + if (!(state & IRQS_SPURIOUS_DISABLED)) continue; local_irq_disable(); @@ -298,7 +298,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_SPURIOUS_DISABLED; + desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); -- cgit v1.2.1 From 6f91a52d9bb28396177662f1da0f2e2cef9cf5d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 13:33:16 +0100 Subject: genirq: Use modify_status for set_irq_nested_thread No need for a separate function in the core code. 
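With the dedicated set_irq_nested_thread() helper gone from the core, the same effect is expressed through the generic status modifiers the subject line refers to. A usage sketch in kernel context, assuming the irq_set_status_flags()/irq_clear_status_flags() wrappers around irq_modify_status() from include/linux/irq.h; child_irq is an illustrative variable:

	/* Mark a demultiplexed child interrupt as handled in the parent's
	 * thread context instead of getting its own interrupt thread: */
	irq_set_status_flags(child_irq, IRQ_NESTED_THREAD);

	/* ...and undo it when tearing the child interrupt down: */
	irq_clear_status_flags(child_irq, IRQ_NESTED_THREAD);
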
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 143eb2a9fa4e..bff21f233a02 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -164,34 +164,6 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); -/** - * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq - * - * @irq: Interrupt number - * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag - * - * The IRQ_NESTED_THREAD flag indicates that on - * request_threaded_irq() no separate interrupt thread should be - * created for the irq as the handler are called nested in the - * context of a demultiplexing interrupt handler thread. - */ -void set_irq_nested_thread(unsigned int irq, int nest) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - raw_spin_lock_irqsave(&desc->lock, flags); - if (nest) - desc->status |= IRQ_NESTED_THREAD; - else - desc->status &= ~IRQ_NESTED_THREAD; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} -EXPORT_SYMBOL_GPL(set_irq_nested_thread); - int irq_startup(struct irq_desc *desc) { desc->status &= ~IRQ_DISABLED; -- cgit v1.2.1 From 6954b75b488dd740950573f244ddd66fd28620aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:55:35 +0100 Subject: genirq: Move IRQ_POLL_INPROGRESS to core No users outside of core. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 2 ++ kernel/irq/spurious.c | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bff21f233a02..34245e7d1213 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -400,7 +400,7 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); static bool irq_check_poll(struct irq_desc *desc) { - if (!(desc->status & IRQ_POLL_INPROGRESS)) + if (!(desc->istate & IRQS_POLL_INPROGRESS)) return false; return irq_wait_for_poll(desc); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dc5e21b84f9e..f5d28e1e1eda 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -39,10 +39,12 @@ enum { * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection + * IRQS_POLL_INPROGRESS - polling in progress */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, + IRQS_POLL_INPROGRESS = 0x00000008, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2941d8a22df7..21c46178b1a6 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -98,13 +98,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) } /* Mark it poll in progress */ - desc->status |= IRQ_POLL_INPROGRESS; + desc->istate |= IRQS_POLL_INPROGRESS; do { if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; action = desc->action; } while ((desc->status & IRQ_PENDING) && action); - desc->status &= ~IRQ_POLL_INPROGRESS; + desc->istate &= ~IRQS_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); return ret == IRQ_HANDLED; @@ -259,7 +259,7 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { - if (desc->status & IRQ_POLL_INPROGRESS) + if (desc->istate & IRQS_POLL_INPROGRESS) return; if (unlikely(action_ret != 
IRQ_HANDLED)) { -- cgit v1.2.1 From 009b4c3b8ad584b3462734127a5bec680d5d6af4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 21:48:49 +0100 Subject: genirq: Add IRQ_INPROGRESS to core We need to maintain the flag for now in both fields status and istate. Add a CONFIG_GENERIC_HARDIRQS_NO_COMPAT switch to allow testing w/o the status one. Wrap the access to status IRQ_INPROGRESS in a inline which can be turned of with CONFIG_GENERIC_HARDIRQS_NO_COMPAT along with the define. There is no reason that anything outside of core looks at this. That needs some modifications, but we'll get there. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 +++ kernel/irq/chip.c | 16 +++++++++------- kernel/irq/compat.h | 17 +++++++++++++++++ kernel/irq/handle.c | 6 ++++-- kernel/irq/internals.h | 5 ++++- kernel/irq/manage.c | 17 +++++++++-------- kernel/irq/settings.h | 3 +++ kernel/irq/spurious.c | 6 +++--- 8 files changed, 52 insertions(+), 21 deletions(-) create mode 100644 kernel/irq/compat.h (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 4cd5d7135e0f..9e2256de1d1a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -13,6 +13,9 @@ config GENERIC_HARDIRQS config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n +config GENERIC_HARDIRQS_NO_COMPAT + def_bool n + # Options selectable by the architecture code config HAVE_SPARSE_IRQ def_bool n diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 34245e7d1213..075385549dcd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -383,7 +383,8 @@ void handle_nested_irq(unsigned int irq) if (unlikely(!action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -391,7 +392,8 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -422,7 +424,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out_unlock; @@ -454,7 +456,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out_unlock; @@ -492,7 +494,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out; @@ -542,8 +544,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. 
Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || - !desc->action)) { + if (unlikely((desc->istate & (IRQS_INPROGRESS) || + (desc->status & IRQ_DISABLED) || !desc->action))) { if (!irq_check_poll(desc)) { desc->status |= IRQ_PENDING; mask_ack_irq(desc); diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h new file mode 100644 index 000000000000..aac6e400e608 --- /dev/null +++ b/kernel/irq/compat.h @@ -0,0 +1,17 @@ +/* + * Compat layer for transition period + */ +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +static inline void irq_compat_set_progress(struct irq_desc *desc) +{ + desc->status |= IRQ_INPROGRESS; +} + +static inline void irq_compat_clr_progress(struct irq_desc *desc) +{ + desc->status &= ~IRQ_INPROGRESS; +} +#else +static inline void irq_compat_set_progress(struct irq_desc *desc) { } +static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +#endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ff40e0f5e2e2..d4ae0b1ccc00 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -123,13 +123,15 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irqreturn_t ret; desc->status &= ~IRQ_PENDING; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f5d28e1e1eda..d1cb1f8df6fe 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,7 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#include "compat.h" #include "settings.h" #define istate core_internal_state__do_not_mess_with_it @@ -40,11 +41,13 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress + * IRQS_INPROGRESS - Interrupt in progress */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, + IRQS_INPROGRESS = 0x00000010, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -128,7 +131,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_INPROGRESS); P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_REPLAY); @@ -143,6 +145,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5b918ffa46af..7e5a50825088 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -30,7 +30,7 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int status; + unsigned int state; if (!desc) return; @@ -42,16 +42,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - status = desc->status; + state = desc->istate; raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? 
*/ - } while (status & IRQ_INPROGRESS); + } while (state & IRQS_INPROGRESS); /* * We made sure that no hardirq handler is running. Now verify @@ -637,9 +637,9 @@ again: * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due - * to IRQ_INPROGRESS and the irq line is masked forever. + * to IRQS_INPROGRESS and the irq line is masked forever. */ - if (unlikely(desc->status & IRQ_INPROGRESS)) { + if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS); - desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED); + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ + IRQS_INPROGRESS); if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 610f55597ce7..a96140eea409 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -5,3 +5,6 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, }; + +#undef IRQ_INPROGRESS +#define IRQ_INPROGRESS GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 21c46178b1a6..51504837d8cc 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,10 +45,10 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->status & IRQ_INPROGRESS); + } while (desc->istate & IRQS_INPROGRESS); /* Might have been disabled in meantime */ return !(desc->status & IRQ_DISABLED) && desc->action; #else @@ -88,7 +88,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { + if (desc->istate & IRQS_INPROGRESS) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too -- cgit v1.2.1 From 3d67baec7f1b01fc289ac1a2f1a7e6d5e43391c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 21:02:10 +0100 Subject: genirq: Move IRQ_ONESHOT to core No users outside of core. 
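For context on where IRQS_ONESHOT comes from: __setup_irq() below sets it when a driver passes IRQF_ONESHOT, meaning the line stays masked until the threaded handler has finished. A driver-side sketch, kernel context rather than standalone; the device type, handler names and "demo-dev" label are illustrative:

	#include <linux/interrupt.h>

	struct demo_device;

	static irqreturn_t demo_hardirq(int irq, void *dev_id)
	{
		/* quick check/ack in hard interrupt context */
		return IRQ_WAKE_THREAD;
	}

	static irqreturn_t demo_thread_fn(int irq, void *dev_id)
	{
		/* slow work in process context; with IRQF_ONESHOT the line
		 * stays masked until this returns */
		return IRQ_HANDLED;
	}

	static int demo_request(unsigned int irq, struct demo_device *dev)
	{
		return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
					    IRQF_ONESHOT, "demo-dev", dev);
	}
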
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 075385549dcd..420fa6bdb117 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -472,7 +472,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d1cb1f8df6fe..36563f731ff8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,12 +42,14 @@ enum { * detection * IRQS_POLL_INPROGRESS - polling in progress * IRQS_INPROGRESS - Interrupt in progress + * IRQS_ONESHOT - irq is not unmasked in primary handler */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, IRQS_INPROGRESS = 0x00000010, + IRQS_ONESHOT = 0x00000020, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7e5a50825088..aca4208a03ac 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -697,7 +697,7 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->status & IRQ_ONESHOT; + int wake, oneshot = desc->istate & IRQS_ONESHOT; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -897,12 +897,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT); + desc->status &= ~IRQ_WAITING; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS); + IRQS_INPROGRESS | IRQS_ONESHOT); if (new->flags & IRQF_ONESHOT) - desc->status |= IRQ_ONESHOT; + desc->istate |= IRQS_ONESHOT; if (!(desc->status & IRQ_NOAUTOEN)) irq_startup(desc); -- cgit v1.2.1 From 163ef3091195f514a06f064b12914597d2644c55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 11:39:15 +0100 Subject: genirq: Move IRQ_REPLAY and IRQ_WAITING to core No users outside of core. Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 11 +++++------ kernel/irq/chip.c | 9 ++++----- kernel/irq/internals.h | 8 ++++++-- kernel/irq/manage.c | 4 ++-- kernel/irq/resend.c | 7 +++++-- kernel/irq/settings.h | 4 ++++ 6 files changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 916e56e10a2e..9ea8bb99f7c1 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -17,7 +17,7 @@ /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with - * "IRQ_WAITING" cleared and the interrupt disabled. + * "IRQS_WAITING" cleared and the interrupt disabled. 
*/ static DEFINE_MUTEX(probing_active); @@ -75,8 +75,7 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->istate |= IRQS_AUTODETECT; - desc->status |= IRQ_WAITING; + desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_startup(desc)) desc->status |= IRQ_PENDING; } @@ -96,7 +95,7 @@ unsigned long probe_irq_on(void) if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. */ - if (!(desc->status & IRQ_WAITING)) { + if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } else @@ -131,7 +130,7 @@ unsigned int probe_irq_mask(unsigned long val) for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { - if (i < 16 && !(desc->status & IRQ_WAITING)) + if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; desc->istate &= ~IRQS_AUTODETECT; @@ -171,7 +170,7 @@ int probe_irq_off(unsigned long val) raw_spin_lock_irq(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { - if (!(desc->status & IRQ_WAITING)) { + if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 420fa6bdb117..59ae14527ecd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -428,7 +428,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) @@ -460,7 +460,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* @@ -498,7 +498,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* @@ -537,8 +537,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If we're currently running this IRQ, or its disabled, * we shouldn't process the IRQ. 
Mark it pending, handle diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 36563f731ff8..54037533af7a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -43,6 +43,8 @@ enum { * IRQS_POLL_INPROGRESS - polling in progress * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler + * IRQS_REPLAY - irq is replayed + * IRQS_WAITING - irq is waiting */ enum { IRQS_AUTODETECT = 0x00000001, @@ -50,6 +52,8 @@ enum { IRQS_POLL_INPROGRESS = 0x00000008, IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, + IRQS_REPLAY = 0x00000040, + IRQS_WAITING = 0x00000080, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -135,8 +139,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_DISABLED); P(IRQ_PENDING); - P(IRQ_REPLAY); - P(IRQ_WAITING); P(IRQ_LEVEL); P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU @@ -148,6 +150,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_AUTODETECT); PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index aca4208a03ac..7971df53d6a9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,9 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~IRQ_WAITING; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT); + IRQS_INPROGRESS | IRQS_ONESHOT | \ + IRQS_WAITING); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 60b20261041b..f83387cd11f3 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -62,8 +62,11 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) */ if (desc->status & IRQ_LEVEL) return; - if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY; + if (desc->istate & IRQS_REPLAY) + return; + if (desc->status & IRQ_PENDING) { + desc->status &= ~IRQ_PENDING; + desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index a96140eea409..2e7d08fff0ce 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -8,3 +8,7 @@ enum { #undef IRQ_INPROGRESS #define IRQ_INPROGRESS GOT_YOU_MORON +#undef IRQ_REPLAY +#define IRQ_REPLAY GOT_YOU_MORON +#undef IRQ_WAITING +#define IRQ_WAITING GOT_YOU_MORON -- cgit v1.2.1 From c1594b77e46124bb462f961e536120e471c67446 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 22:11:30 +0100 Subject: genirq: Move IRQ_DISABLED to core Keep status in sync until all abusers are fixed. 
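The transition trick used here, as with IRQS_INPROGRESS earlier in the series, is that the core-private bit in desc->istate is authoritative while small compat helpers mirror it into the old public desc->status word, and CONFIG_GENERIC_HARDIRQS_NO_COMPAT compiles the mirroring away once legacy readers are gone. A minimal compilable analogue of that "mirror behind one accessor" idea; all names are illustrative:

	#include <assert.h>

	#define DEMO_COMPAT 1		/* flip to 0 once all readers are converted */

	struct demo_desc {
		unsigned int status;	/* old, publicly visible word */
		unsigned int istate;	/* new, core-private word */
	};

	#define OLD_DISABLED	0x1
	#define NEW_DISABLED	0x1

	static void demo_set_disabled(struct demo_desc *d)
	{
		d->istate |= NEW_DISABLED;
	#if DEMO_COMPAT
		d->status |= OLD_DISABLED;	/* keep legacy readers working */
	#endif
	}

	int main(void)
	{
		struct demo_desc d = { 0, 0 };

		demo_set_disabled(&d);
		assert(d.istate & NEW_DISABLED);
		return 0;
	}
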
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 48 +++++++++++++++++++++++++++++++----------------- kernel/irq/compat.h | 12 ++++++++++++ kernel/irq/internals.h | 4 +++- kernel/irq/irqdesc.c | 2 ++ kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 2 ++ kernel/irq/spurious.c | 4 ++-- 8 files changed, 55 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 59ae14527ecd..527df7ab1b05 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -164,9 +164,21 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); +static void irq_state_clr_disabled(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_DISABLED; + irq_compat_clr_disabled(desc); +} + +static void irq_state_set_disabled(struct irq_desc *desc) +{ + desc->istate |= IRQS_DISABLED; + irq_compat_set_disabled(desc); +} + int irq_startup(struct irq_desc *desc) { - desc->status &= ~IRQ_DISABLED; + irq_state_clr_disabled(desc); desc->depth = 0; if (desc->irq_data.chip->irq_startup) { @@ -181,7 +193,7 @@ int irq_startup(struct irq_desc *desc) void irq_shutdown(struct irq_desc *desc) { - desc->status |= IRQ_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) desc->irq_data.chip->irq_shutdown(&desc->irq_data); @@ -194,7 +206,7 @@ void irq_shutdown(struct irq_desc *desc) void irq_enable(struct irq_desc *desc) { - desc->status &= ~IRQ_DISABLED; + irq_state_clr_disabled(desc); if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else @@ -204,7 +216,7 @@ void irq_enable(struct irq_desc *desc) void irq_disable(struct irq_desc *desc) { - desc->status |= IRQ_DISABLED; + irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); desc->status |= IRQ_MASKED; @@ -380,7 +392,7 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->istate & IRQS_DISABLED))) goto out_unlock; irq_compat_set_progress(desc); @@ -431,7 +443,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; handle_irq_event(desc); @@ -467,12 +479,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; handle_irq_event(desc); - if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT)) + if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ -505,7 +517,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { desc->status |= IRQ_PENDING; mask_irq(desc); goto out; @@ -543,8 +555,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. 
Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_INPROGRESS) || - (desc->status & IRQ_DISABLED) || !desc->action))) { + if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || + !desc->action))) { if (!irq_check_poll(desc)) { desc->status |= IRQ_PENDING; mask_ack_irq(desc); @@ -567,15 +579,16 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. */ - if (unlikely((desc->status & - (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == - (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc); + if (unlikely(desc->status & IRQ_PENDING)) { + if (!(desc->istate & IRQS_DISABLED) && + (desc->status & IRQ_MASKED)) + unmask_irq(desc); } handle_irq_event(desc); - } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + } while ((desc->status & IRQ_PENDING) && + !(desc->istate & IRQS_DISABLED)); out_unlock: raw_spin_unlock(&desc->lock); @@ -639,7 +652,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - desc->status |= IRQ_DISABLED; + irq_compat_set_disabled(desc); + desc->istate |= IRQS_DISABLED; desc->depth = 1; } desc->handle_irq = handle; diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index aac6e400e608..bc0c2a501e82 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -11,7 +11,19 @@ static inline void irq_compat_clr_progress(struct irq_desc *desc) { desc->status &= ~IRQ_INPROGRESS; } +static inline void irq_compat_set_disabled(struct irq_desc *desc) +{ + desc->status |= IRQ_DISABLED; +} + +static inline void irq_compat_clr_disabled(struct irq_desc *desc) +{ + desc->status &= ~IRQ_DISABLED; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +static inline void irq_compat_set_disabled(struct irq_desc *desc) { } +static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } #endif + diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 54037533af7a..919d2dd0bb33 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -45,6 +45,7 @@ enum { * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting + * IRQS_DISABLED - irq is disabled */ enum { IRQS_AUTODETECT = 0x00000001, @@ -54,6 +55,7 @@ enum { IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, + IRQS_DISABLED = 0x00000100, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -137,7 +139,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_LEVEL); P(IRQ_MASKED); @@ -152,6 +153,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); + PS(IRQS_DISABLED); } #undef P diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8b87f2ce0203..78866d050bc9 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,6 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; desc->status = _IRQ_DEFAULT_INIT_FLAGS; + desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; 
desc->irq_count = 0; @@ -247,6 +248,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = _IRQ_DEFAULT_INIT_FLAGS, + .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7971df53d6a9..77ff275b54cf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -646,7 +646,7 @@ again: goto again; } - if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { + if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } @@ -709,7 +709,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->status & IRQ_DISABLED)) { + if (unlikely(desc->istate & IRQS_DISABLED)) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 441fd629ff04..8c68cb8555a7 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -61,7 +61,7 @@ void move_native_irq(int irq) if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; - if (unlikely(desc->status & IRQ_DISABLED)) + if (unlikely(desc->istate & IRQS_DISABLED)) return; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2e7d08fff0ce..5e3411c7c62b 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -12,3 +12,5 @@ enum { #define IRQ_REPLAY GOT_YOU_MORON #undef IRQ_WAITING #define IRQ_WAITING GOT_YOU_MORON +#undef IRQ_DISABLED +#define IRQ_DISABLED GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 51504837d8cc..367614f858ff 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -50,7 +50,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) raw_spin_lock(&desc->lock); } while (desc->istate & IRQS_INPROGRESS); /* Might have been disabled in meantime */ - return !(desc->status & IRQ_DISABLED) && desc->action; + return !(desc->istate & IRQS_DISABLED) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. */ - if ((desc->status & IRQ_DISABLED) && !force) + if ((desc->istate & IRQS_DISABLED) && !force) goto out; /* -- cgit v1.2.1 From 2a0d6fb335d4428285dab2d254911748e6040807 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:17:57 +0100 Subject: genirq: Move IRQ_PENDING flag to core Keep status in sync until all users are fixed. 
Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 6 ++++-- kernel/irq/chip.c | 10 ++++++---- kernel/irq/compat.h | 12 +++++++++++- kernel/irq/handle.c | 3 ++- kernel/irq/internals.h | 4 +++- kernel/irq/manage.c | 5 +++-- kernel/irq/pm.c | 3 ++- kernel/irq/resend.c | 5 +++-- kernel/irq/settings.h | 2 ++ kernel/irq/spurious.c | 5 +++-- 10 files changed, 39 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 9ea8bb99f7c1..aab64c262726 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -76,8 +76,10 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) - desc->status |= IRQ_PENDING; + if (irq_startup(desc)) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 527df7ab1b05..17c87865bfb1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -518,7 +518,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * then mask it and get out of here: */ if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } @@ -558,7 +559,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || !desc->action))) { if (!irq_check_poll(desc)) { - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } @@ -579,7 +581,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. 
*/ - if (unlikely(desc->status & IRQ_PENDING)) { + if (unlikely(desc->istate & IRQS_PENDING)) { if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) unmask_irq(desc); @@ -587,7 +589,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - } while ((desc->status & IRQ_PENDING) && + } while ((desc->istate & IRQS_PENDING) && !(desc->istate & IRQS_DISABLED)); out_unlock: diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index bc0c2a501e82..0067a69781f4 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -15,15 +15,25 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc) { desc->status |= IRQ_DISABLED; } - static inline void irq_compat_clr_disabled(struct irq_desc *desc) { desc->status &= ~IRQ_DISABLED; } +static inline void irq_compat_set_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_PENDING; +} + +static inline void irq_compat_clr_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_PENDING; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } static inline void irq_compat_set_disabled(struct irq_desc *desc) { } static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } +static inline void irq_compat_set_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_pending(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d4ae0b1ccc00..6e34bdbeb26a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -122,7 +122,8 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - desc->status &= ~IRQ_PENDING; + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; irq_compat_set_progress(desc); desc->istate |= IRQS_INPROGRESS; raw_spin_unlock(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 919d2dd0bb33..fdf2524437eb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -46,6 +46,7 @@ enum { * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting * IRQS_DISABLED - irq is disabled + * IRQS_PENDING - irq is pending and replayed later */ enum { IRQS_AUTODETECT = 0x00000001, @@ -56,6 +57,7 @@ enum { IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, IRQS_DISABLED = 0x00000100, + IRQS_PENDING = 0x00000200, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -139,7 +141,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_PENDING); P(IRQ_LEVEL); P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU @@ -154,6 +155,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_REPLAY); PS(IRQS_WAITING); PS(IRQS_DISABLED); + PS(IRQS_PENDING); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 77ff275b54cf..ac060814a787 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -714,10 +714,11 @@ static int irq_thread(void *data) * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which * retriggers the thread in check_irq_resend() - * but AFAICT IRQ_PENDING should be fine as it + * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); diff --git 
a/kernel/irq/pm.c b/kernel/irq/pm.c index d6bfb89cce91..d7389418e91a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,8 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) + if ((desc->status & IRQ_WAKEUP) && + (desc->istate & IRQS_PENDING)) return -EBUSY; return 0; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index f83387cd11f3..ff1fea060014 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -64,8 +64,9 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) return; if (desc->istate & IRQS_REPLAY) return; - if (desc->status & IRQ_PENDING) { - desc->status &= ~IRQ_PENDING; + if (desc->istate & IRQS_PENDING) { + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 5e3411c7c62b..623fcf83e7de 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,3 +14,5 @@ enum { #define IRQ_WAITING GOT_YOU_MORON #undef IRQ_DISABLED #define IRQ_DISABLED GOT_YOU_MORON +#undef IRQ_PENDING +#define IRQ_PENDING GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 367614f858ff..692ce2bae302 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -93,7 +93,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; goto out; } @@ -103,7 +104,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; action = desc->action; - } while ((desc->status & IRQ_PENDING) && action); + } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); -- cgit v1.2.1 From 6e40262ea43c4b0e3f435b3a083e4461ef921c17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:36:06 +0100 Subject: genirq: Move IRQ_MASKED to core Keep status in sync until all users are fixed. 
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 28 ++++++++++++++++++++-------- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 4 +++- kernel/irq/manage.c | 5 +++-- kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 2 ++ 6 files changed, 40 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 17c87865bfb1..73b2e7e00934 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -176,6 +176,18 @@ static void irq_state_set_disabled(struct irq_desc *desc) irq_compat_set_disabled(desc); } +static void irq_state_clr_masked(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_MASKED; + irq_compat_clr_masked(desc); +} + +static void irq_state_set_masked(struct irq_desc *desc) +{ + desc->istate |= IRQS_MASKED; + irq_compat_set_masked(desc); +} + int irq_startup(struct irq_desc *desc) { irq_state_clr_disabled(desc); @@ -183,7 +195,7 @@ int irq_startup(struct irq_desc *desc) if (desc->irq_data.chip->irq_startup) { int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); return ret; } @@ -201,7 +213,7 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } void irq_enable(struct irq_desc *desc) @@ -211,7 +223,7 @@ void irq_enable(struct irq_desc *desc) desc->irq_data.chip->irq_enable(&desc->irq_data); else desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } void irq_disable(struct irq_desc *desc) @@ -219,8 +231,8 @@ void irq_disable(struct irq_desc *desc) irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); - desc->status |= IRQ_MASKED; } + irq_state_set_masked(desc); } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -352,14 +364,14 @@ static inline void mask_ack_irq(struct irq_desc *desc) if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } static inline void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } } @@ -367,7 +379,7 @@ static inline void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } } @@ -583,7 +595,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) */ if (unlikely(desc->istate & IRQS_PENDING)) { if (!(desc->istate & IRQS_DISABLED) && - (desc->status & IRQ_MASKED)) + (desc->istate & IRQS_MASKED)) unmask_irq(desc); } diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 0067a69781f4..593abecbcc44 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -28,6 +28,15 @@ static inline void irq_compat_clr_pending(struct irq_desc *desc) { desc->status &= ~IRQ_PENDING; } +static inline void irq_compat_set_masked(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED; +} + +static inline void irq_compat_clr_masked(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MASKED; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -35,5 +44,7 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc) 
{ } static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } static inline void irq_compat_set_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_masked(struct irq_desc *desc) { } +static inline void irq_compat_clr_masked(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fdf2524437eb..3f2fcc194dcc 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -47,6 +47,7 @@ enum { * IRQS_WAITING - irq is waiting * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later + * IRQS_MASKED - irq is masked */ enum { IRQS_AUTODETECT = 0x00000001, @@ -58,6 +59,7 @@ enum { IRQS_WAITING = 0x00000080, IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, + IRQS_MASKED = 0x00000400, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -142,7 +144,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) } P(IRQ_LEVEL); - P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU P(IRQ_PER_CPU); #endif @@ -156,6 +157,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_WAITING); PS(IRQS_DISABLED); PS(IRQS_PENDING); + PS(IRQS_MASKED); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac060814a787..83fd20194e5b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -646,8 +646,9 @@ again: goto again; } - if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) { - desc->status &= ~IRQ_MASKED; + if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) { + irq_compat_clr_masked(desc); + desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 8c68cb8555a7..6f2f98480354 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -69,7 +69,7 @@ void move_native_irq(int irq) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->status & IRQ_MASKED; + masked = desc->istate & IRQS_MASKED; if (!masked) desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 623fcf83e7de..2cd45fd5ec8a 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -16,3 +16,5 @@ enum { #define IRQ_DISABLED GOT_YOU_MORON #undef IRQ_PENDING #define IRQ_PENDING GOT_YOU_MORON +#undef IRQ_MASKED +#define IRQ_MASKED GOT_YOU_MORON -- cgit v1.2.1 From c531e8361f1968d664e6e97fbd3bfa4cf0e62e42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:44:58 +0100 Subject: genirq: Move IRQ_SUSPENDED to core No users outside of core. 
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 8 ++++---- kernel/irq/pm.c | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3f2fcc194dcc..46889119e6a6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -48,6 +48,7 @@ enum { * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked + * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,6 +61,7 @@ enum { IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, + IRQS_SUSPENDED = 0x00000800, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 83fd20194e5b..b912de4ff4de 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -326,7 +326,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (suspend) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; - desc->status |= IRQ_SUSPENDED; + desc->istate |= IRQS_SUSPENDED; } if (!desc->depth++) @@ -388,7 +388,7 @@ EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { if (resume) { - if (!(desc->status & IRQ_SUSPENDED)) { + if (!(desc->istate & IRQS_SUSPENDED)) { if (!desc->action) return; if (!(desc->action->flags & IRQF_FORCE_RESUME)) @@ -396,7 +396,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) /* Pretend that it got disabled ! */ desc->depth++; } - desc->status &= ~IRQ_SUSPENDED; + desc->istate &= ~IRQS_SUSPENDED; } switch (desc->depth) { @@ -405,7 +405,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ desc->status |= IRQ_NOPROBE; diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d7389418e91a..d81337fc1cff 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -18,7 +18,7 @@ * During system-wide suspend or hibernation device drivers need to be prevented * from receiving interrupts and this function is provided for this purpose. * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQ_SUSPENDED flag for each of them. + * and sets the IRQS_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { @@ -34,7 +34,7 @@ void suspend_device_irqs(void) } for_each_irq_desc(irq, desc) - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) synchronize_irq(irq); } EXPORT_SYMBOL_GPL(suspend_device_irqs); @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQ_SUSPENDED flag set. + * have the IRQS_SUSPENDED flag set. */ void resume_device_irqs(void) { -- cgit v1.2.1 From 6d2cd17fde1fc3e93302815f049f255bb2b3123e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 14:34:18 +0100 Subject: genirq: Move IRQ_WAKEUP to core No users outside of core. 
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 4 ++-- kernel/irq/pm.c | 2 +- kernel/irq/settings.h | 2 ++ 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46889119e6a6..cef0849dcfa5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -49,6 +49,7 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended + * IRQS_WAKEUP - irq triggers system wakeup from suspend */ enum { IRQS_AUTODETECT = 0x00000001, @@ -62,6 +63,7 @@ enum { IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, + IRQS_WAKEUP = 0x00001000, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b912de4ff4de..ccc9389909ff 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -489,7 +489,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 0; else - desc->status |= IRQ_WAKEUP; + desc->istate |= IRQS_WAKEUP; } } else { if (desc->wake_depth == 0) { @@ -499,7 +499,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->status &= ~IRQ_WAKEUP; + desc->istate &= ~IRQS_WAKEUP; } } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d81337fc1cff..f39383d8672d 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,7 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && + if ((desc->istate & IRQS_WAKEUP) && (desc->istate & IRQS_PENDING)) return -EBUSY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2cd45fd5ec8a..ef09824e4b32 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -18,3 +18,5 @@ enum { #define IRQ_PENDING GOT_YOU_MORON #undef IRQ_MASKED #define IRQ_MASKED GOT_YOU_MORON +#undef IRQ_WAKEUP +#define IRQ_WAKEUP GOT_YOU_MORON -- cgit v1.2.1 From f230b6d5c48f8d12f4dfa1f8b5ab0b0320076d21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Feb 2011 15:20:04 +0100 Subject: genirq: Add IRQ_MOVE_PENDING to irq_data.state chip implementations need to know about it. Keep status in sync until all users are fixed. Accessor function: irqd_is_setaffinity_pending(irqdata) Coders who access them directly will be tracked down and slapped with stinking trouts. 
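A minimal usage sketch, modelled on the callers converted in the diff below (all names come from the patch): code that used to test desc->status & IRQ_MOVE_PENDING now asks the irq_data state through the accessor.

	const struct cpumask *mask = desc->irq_data.affinity;

	/* an affinity change is still queued, report the pending mask instead */
	if (irqd_is_setaffinity_pending(&desc->irq_data))
		mask = desc->pending_mask;

Setting and clearing the bit stays inside kernel/irq via the irqd_set_move_pending()/irqd_clr_move_pending() helpers added below.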
Signed-off-by: Thomas Gleixner --- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 15 +++++++++++++++ kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 6 +++--- kernel/irq/proc.c | 2 +- kernel/irq/settings.h | 2 ++ 6 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 593abecbcc44..5e33aadadacc 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -37,6 +37,15 @@ static inline void irq_compat_clr_masked(struct irq_desc *desc) { desc->status &= ~IRQ_MASKED; } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_MOVE_PENDING; +} + +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MOVE_PENDING; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -46,5 +55,7 @@ static inline void irq_compat_set_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_pending(struct irq_desc *desc) { } static inline void irq_compat_set_masked(struct irq_desc *desc) { } static inline void irq_compat_clr_masked(struct irq_desc *desc) { } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index cef0849dcfa5..e93e6090cd47 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -124,6 +124,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +/* + * Manipulation functions for irq_data.state + */ +static inline void irqd_set_move_pending(struct irq_data *d) +{ + d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; + irq_compat_set_move_pending(irq_data_to_desc(d)); +} + +static inline void irqd_clr_move_pending(struct irq_data *d) +{ + d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; + irq_compat_clr_move_pending(irq_data_to_desc(d)); +} + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ccc9389909ff..9a99c471d470 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -107,7 +107,7 @@ static inline bool irq_can_move_pcntxt(struct irq_desc *desc) } static inline bool irq_move_pending(struct irq_desc *desc) { - return desc->status & IRQ_MOVE_PENDING; + return irqd_is_setaffinity_pending(&desc->irq_data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -156,7 +156,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) ret = 0; } } else { - desc->status |= IRQ_MOVE_PENDING; + irqd_set_move_pending(&desc->irq_data); irq_copy_pending(desc, mask); } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 6f2f98480354..9485ae081dcd 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -9,7 +9,7 @@ void move_masked_irq(int irq) struct irq_desc *desc = irq_to_desc(irq); struct irq_chip *chip = desc->irq_data.chip; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; /* @@ -20,7 +20,7 @@ void move_masked_irq(int irq) return; } - desc->status &= ~IRQ_MOVE_PENDING; + irqd_clr_move_pending(&desc->irq_data); if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -58,7 +58,7 @@ void move_native_irq(int irq) struct irq_desc *desc = irq_to_desc(irq); bool masked; - if 
(likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; if (unlikely(desc->istate & IRQS_DISABLED)) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 26449239bb46..afe4e6803148 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -25,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) + if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif seq_cpumask(m, mask); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index ef09824e4b32..bb104a2dce73 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -20,3 +20,5 @@ enum { #define IRQ_MASKED GOT_YOU_MORON #undef IRQ_WAKEUP #define IRQ_WAKEUP GOT_YOU_MORON +#undef IRQ_MOVE_PENDING +#define IRQ_MOVE_PENDING GOT_YOU_MORON -- cgit v1.2.1 From 6a58fb3bad099076f36f0f30f44507bc3275cdb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 15:40:05 +0100 Subject: genirq: Remove CONFIG_IRQ_PER_CPU The saving of this switch is minimal versus the ifdef mess it creates. Simple enable PER_CPU unconditionally and remove the config switch. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 9 +++------ 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9e2256de1d1a..48ad25f5fa59 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -32,9 +32,6 @@ config GENERIC_PENDING_IRQ config AUTO_IRQ_AFFINITY def_bool n -config IRQ_PER_CPU - def_bool n - config HARDIRQS_SW_RESEND def_bool n diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e93e6090cd47..9e32b3d35d35 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -163,9 +163,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) } P(IRQ_LEVEL); -#ifdef CONFIG_IRQ_PER_CPU P(IRQ_PER_CPU); -#endif P(IRQ_NOPROBE); P(IRQ_NOREQUEST); P(IRQ_NOAUTOEN); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9a99c471d470..056aa49698b4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -865,12 +865,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto mismatch; } -#if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; -#endif /* add new interrupt at end of irq queue */ do { @@ -894,15 +892,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_mask; } else compat_irq_chip_set_default_handler(desc); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_INPROGRESS | IRQS_ONESHOT | \ IRQS_WAITING); + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; + if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; -- cgit v1.2.1 From fae581e588e64a0690f3fc995e404fcacaebe772 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 16:53:24 +0100 Subject: genirq: Remove CHECK_IRQ_PER_CPU from core code Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 
056aa49698b4..f1cfa271ba70 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9485ae081dcd..24f53caddf47 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -15,7 +15,7 @@ void move_masked_irq(int irq) /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (CHECK_IRQ_PER_CPU(desc->status)) { + if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) { WARN_ON(1); return; } -- cgit v1.2.1 From 1ce6068dac1924f7095be5850481e790cbf1b3c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 20:44:21 +0100 Subject: genirq: Move debug code to separate header It'll break when I'm going to undefine the constants. Signed-off-by: Thomas Gleixner --- kernel/irq/debug.h | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/irq/internals.h | 48 ++++-------------------------------------------- 2 files changed, 44 insertions(+), 44 deletions(-) create mode 100644 kernel/irq/debug.h (limited to 'kernel') diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 000000000000..d1a33b7fa61d --- /dev/null +++ b/kernel/irq/debug.h @@ -0,0 +1,40 @@ +/* + * Debugging printout: + */ + +#include + +#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_LEVEL); + P(IRQ_PER_CPU); + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); + PS(IRQS_DISABLED); + PS(IRQS_PENDING); + PS(IRQS_MASKED); +} + +#undef P +#undef PS diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9e32b3d35d35..b2ba59e73f21 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,9 +13,6 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif -#include "compat.h" -#include "settings.h" - #define istate core_internal_state__do_not_mess_with_it extern int noirqdebug; @@ -66,6 +63,10 @@ enum { IRQS_WAKEUP = 0x00001000, }; +#include "compat.h" +#include "debug.h" +#include "settings.h" + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ @@ -138,44 +139,3 @@ static inline void irqd_clr_move_pending(struct irq_data *d) d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; irq_compat_clr_move_pending(irq_data_to_desc(d)); } - -/* - * Debugging printout: - */ - -#include - -#define P(f) if (desc->status & f) printk("%14s set\n", #f) -#define PS(f) if (desc->istate & f) 
printk("%14s set\n", #f) - -static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", - irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); - printk("->action(): %p\n", desc->action); - if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); - } - - P(IRQ_LEVEL); - P(IRQ_PER_CPU); - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOAUTOEN); - - PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); - PS(IRQS_REPLAY); - PS(IRQS_WAITING); - PS(IRQS_DISABLED); - PS(IRQS_PENDING); - PS(IRQS_MASKED); -} - -#undef P -#undef PS -- cgit v1.2.1 From a005677b3dd05decdd8880cf3044ae709856f58f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:11:03 +0100 Subject: genirq: Mirror IRQ_PER_CPU and IRQ_NO_BALANCING in irq_data.state That's the right data structure to look at for arch code. Accessor functions are provided. irqd_is_per_cpu(irqdata); irqd_can_balance(irqdata); Coders who access them directly will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 15 +++++++++------ kernel/irq/internals.h | 11 +++++++++++ kernel/irq/manage.c | 16 ++++++++++------ kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/irq/spurious.c | 3 ++- 6 files changed, 69 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 73b2e7e00934..b8aa3dfe8301 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -706,12 +706,15 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (!desc) return; - /* Sanitize flags */ - set &= IRQF_MODIFY_MASK; - clr &= IRQF_MODIFY_MASK; - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~clr; - desc->status |= set; + + irq_settings_clr_and_set(desc, clr, set); + + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2ba59e73f21..a80b44d2735e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -139,3 +139,14 @@ static inline void irqd_clr_move_pending(struct irq_data *d) d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; irq_compat_clr_move_pending(irq_data_to_desc(d)); } + +static inline void irqd_clear(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors &= ~mask; +} + +static inline void irqd_set(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors |= mask; +} + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f1cfa271ba70..84a0a9c22226 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) || - !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) + if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || + !desc->irq_data.chip->irq_set_affinity) 
return 0; return 1; @@ -897,8 +897,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) IRQS_INPROGRESS | IRQS_ONESHOT | \ IRQS_WAITING); - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; + if (new->flags & IRQF_PERCPU) { + irqd_set(&desc->irq_data, IRQD_PER_CPU); + irq_settings_set_per_cpu(desc); + } if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; @@ -910,8 +912,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->depth = 1; /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 24f53caddf47..7a93c6b88b25 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -15,7 +15,7 @@ void move_masked_irq(int irq) /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) { + if (!irqd_can_balance(&desc->irq_data)) { WARN_ON(1); return; } diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index bb104a2dce73..ba0fffe410ad 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -4,6 +4,9 @@ */ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, + _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; #undef IRQ_INPROGRESS @@ -22,3 +25,36 @@ enum { #define IRQ_WAKEUP GOT_YOU_MORON #undef IRQ_MOVE_PENDING #define IRQ_MOVE_PENDING GOT_YOU_MORON +#undef IRQ_PER_CPU +#define IRQ_PER_CPU GOT_YOU_MORON +#undef IRQ_NO_BALANCING +#define IRQ_NO_BALANCING GOT_YOU_MORON +#undef IRQF_MODIFY_MASK +#define IRQF_MODIFY_MASK GOT_YOU_MORON + +static inline void +irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) +{ + desc->status &= ~(clr & _IRQF_MODIFY_MASK); + desc->status |= (set & _IRQF_MODIFY_MASK); +} + +static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) +{ + return desc->status & _IRQ_PER_CPU; +} + +static inline void irq_settings_set_per_cpu(struct irq_desc *desc) +{ + desc->status |= _IRQ_PER_CPU; +} + +static inline void irq_settings_set_no_balancing(struct irq_desc *desc) +{ + desc->status |= _IRQ_NO_BALANCING; +} + +static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) +{ + return desc->status & _IRQ_NO_BALANCING; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 692ce2bae302..226ed7d26a84 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -68,7 +68,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* PER_CPU and nested thread interrupts are never polled */ - if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD)) + if (irq_settings_is_per_cpu(desc) || + (desc->status & IRQ_NESTED_THREAD)) goto out; /* -- cgit v1.2.1 From bce43032ad79fae0ce5b6174ce1321e643ceb54b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:37:41 +0100 Subject: genirq: Reuse existing can set affinty check Add a !desc check while at it. 
Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- kernel/irq/proc.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84a0a9c22226..550ae97a0040 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index afe4e6803148..4cc2e5ed0bec 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -66,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || - irq_balancing_disabled(irq)) + if (!irq_can_set_affinity(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) -- cgit v1.2.1 From 2bdd10558c8d93009cb6c32ce9e30800fbb08add Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:22:00 +0100 Subject: genirq: Move IRQ_AFFINITY_SET to core Keep status in sync until last abuser is gone. Signed-off-by: Thomas Gleixner --- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 4 ++++ kernel/irq/manage.c | 11 +++++++---- kernel/irq/settings.h | 2 ++ 4 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 5e33aadadacc..6bbaf66aca85 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -46,6 +46,15 @@ static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { desc->status &= ~IRQ_MOVE_PENDING; } +static inline void irq_compat_set_affinity(struct irq_desc *desc) +{ + desc->status |= IRQ_AFFINITY_SET; +} + +static inline void irq_compat_clr_affinity(struct irq_desc *desc) +{ + desc->status &= ~IRQ_AFFINITY_SET; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -57,5 +66,7 @@ static inline void irq_compat_set_masked(struct irq_desc *desc) { } static inline void irq_compat_clr_masked(struct irq_desc *desc) { } static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_affinity(struct irq_desc *desc) { } +static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a80b44d2735e..6776453c454c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -150,3 +150,7 @@ static inline void irqd_set(struct irq_data *d, unsigned int mask) d->state_use_accessors |= mask; } +static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) +{ + return d->state_use_accessors & mask; +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 550ae97a0040..8246afc81956 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -164,7 +164,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - desc->status |= IRQ_AFFINITY_SET; + irq_compat_set_affinity(desc); + irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); 
raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -272,12 +273,14 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET)) { + if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_data.affinity, cpu_online_mask)) set = desc->irq_data.affinity; - else - desc->status &= ~IRQ_AFFINITY_SET; + else { + irq_compat_clr_affinity(desc); + irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); + } } cpumask_and(mask, cpu_online_mask, set); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index ba0fffe410ad..da5acb446b1c 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -29,6 +29,8 @@ enum { #define IRQ_PER_CPU GOT_YOU_MORON #undef IRQ_NO_BALANCING #define IRQ_NO_BALANCING GOT_YOU_MORON +#undef IRQ_AFFINITY_SET +#define IRQ_AFFINITY_SET GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON -- cgit v1.2.1 From 876dbd4cc1b35c1a4cb96a2be1d43ea0eabce3b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:28:12 +0100 Subject: genirq: Mirror irq trigger type bits in irq_data.state That's the data structure chip functions get provided. Also allow them to signal the core code that they updated the flags in irq_data.state by returning IRQ_SET_MASK_OK_NOCOPY. The default is unchanged. The type bits should be accessed via: val = irqd_get_trigger_type(irqdata); and irqd_set_trigger_type(irqdata, val); Coders who access them directly will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 5 ++++- kernel/irq/manage.c | 44 +++++++++++++++++++++++++++----------------- kernel/irq/resend.c | 2 +- kernel/irq/settings.h | 30 ++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b8aa3dfe8301..9c9b573a718e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -710,11 +710,14 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); - irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); + irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8246afc81956..9ae758ed8e66 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -567,23 +567,32 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return 0; } + flags &= IRQ_TYPE_SENSE_MASK; /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); - if (ret) - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); - else { - if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) - flags |= IRQ_LEVEL; - /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); - desc->status |= flags; + switch (ret) { + case IRQ_SET_MASK_OK: + irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); + irqd_set(&desc->irq_data, flags); + + case 
IRQ_SET_MASK_OK_NOCOPY: + flags = irqd_get_trigger_type(&desc->irq_data); + irq_settings_set_trigger_mask(desc, flags); + irqd_clear(&desc->irq_data, IRQD_LEVEL); + irq_settings_clr_level(desc); + if (flags & IRQ_TYPE_LEVEL_MASK) { + irq_settings_set_level(desc); + irqd_set(&desc->irq_data, IRQD_LEVEL); + } if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); + return 0; + default: + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); } - return ret; } @@ -923,13 +932,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); - } else if ((new->flags & IRQF_TRIGGER_MASK) - && (new->flags & IRQF_TRIGGER_MASK) - != (desc->status & IRQ_TYPE_SENSE_MASK)) { - /* hope the handler works with the actual trigger mode... */ - pr_warning("IRQ %d uses trigger mode %d; requested %d\n", - irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), - (int)(new->flags & IRQF_TRIGGER_MASK)); + } else if (new->flags & IRQF_TRIGGER_MASK) { + unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; + unsigned int omsk = irq_settings_get_trigger_mask(desc); + + if (nmsk != omsk) + /* hope the handler works with current trigger mode */ + pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } new->irq = irq; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ff1fea060014..ad683a99b1ec 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) * interrupts are resent by hardware when they are still * active. */ - if (desc->status & IRQ_LEVEL) + if (irq_settings_is_level(desc)) return; if (desc->istate & IRQS_REPLAY) return; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index da5acb446b1c..2201f2aaa9a0 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -5,6 +5,7 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_LEVEL = IRQ_LEVEL, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -31,6 +32,8 @@ enum { #define IRQ_NO_BALANCING GOT_YOU_MORON #undef IRQ_AFFINITY_SET #define IRQ_AFFINITY_SET GOT_YOU_MORON +#undef IRQ_LEVEL +#define IRQ_LEVEL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -60,3 +63,30 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { return desc->status & _IRQ_NO_BALANCING; } + +static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) +{ + return desc->status & IRQ_TYPE_SENSE_MASK; +} + +static inline void +irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) +{ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= mask & IRQ_TYPE_SENSE_MASK; +} + +static inline bool irq_settings_is_level(struct irq_desc *desc) +{ + return desc->status & _IRQ_LEVEL; +} + +static inline void irq_settings_clr_level(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_LEVEL; +} + +static inline void irq_settings_set_level(struct irq_desc *desc) +{ + desc->status |= _IRQ_LEVEL; +} -- cgit v1.2.1 From 1ccb4e612f68ceefb888c2c6c1def6294ea8666d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 14:44:17 +0100 Subject: genirq: Wrap the remaning IRQ_* flags Use wrappers to keep them away from the core code. 
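A sketch of what the conversion looks like for core code, lifted from the manage.c hunks below (all names come from the patch): status bits are no longer tested directly, the settings helpers are asked instead.

	/* was: if (desc->status & IRQ_NOREQUEST) */
	if (!irq_settings_can_request(desc))
		return -EINVAL;

	/* was: nested = desc->status & IRQ_NESTED_THREAD; */
	nested = irq_settings_is_nested_thread(desc);

The helpers live in kernel/irq/settings.h, where the raw IRQ_* names are redefined to GOT_YOU_MORON so core code cannot reach for them directly.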
Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 3 ++- kernel/irq/manage.c | 14 ++++++------ kernel/irq/settings.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/spurious.c | 3 +-- 5 files changed, 70 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index aab64c262726..c8bbc4fabaab 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -45,7 +45,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + if (!desc->action && irq_settings_can_probe(desc)) { /* * An old-style architecture might still have * the handle_bad_irq handler there: @@ -74,7 +74,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_startup(desc)) { irq_compat_set_pending(desc); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9c9b573a718e..9e9220da4deb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -674,7 +674,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + irq_settings_set_noprobe(desc); + irq_settings_set_norequest(desc); irq_startup(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ae758ed8e66..b5de828e58d9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -103,7 +103,7 @@ void irq_set_thread_affinity(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & IRQ_MOVE_PCNTXT; + return irq_settings_can_move_pcntxt(desc); } static inline bool irq_move_pending(struct irq_desc *desc) { @@ -411,7 +411,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status |= IRQ_NOPROBE; + irq_settings_set_noprobe(desc); irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ @@ -526,7 +526,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (!desc) return 0; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return 0; raw_spin_lock_irqsave(&desc->lock, flags); @@ -820,7 +820,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether the interrupt nests into another interrupt * thread. 
*/ - nested = desc->status & IRQ_NESTED_THREAD; + nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) return -EINVAL; @@ -917,7 +917,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) + if (irq_settings_can_autoenable(desc)) irq_startup(desc); else /* Undo nested disables: */ @@ -1217,7 +1217,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return -EINVAL; if (!handler) { @@ -1292,7 +1292,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NESTED_THREAD) { + if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2201f2aaa9a0..216b6f200e7c 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -6,7 +6,12 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, _IRQ_PER_CPU = IRQ_PER_CPU, _IRQ_LEVEL = IRQ_LEVEL, + _IRQ_NOPROBE = IRQ_NOPROBE, + _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOAUTOEN = IRQ_NOAUTOEN, + _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -34,6 +39,14 @@ enum { #define IRQ_AFFINITY_SET GOT_YOU_MORON #undef IRQ_LEVEL #define IRQ_LEVEL GOT_YOU_MORON +#undef IRQ_NOPROBE +#define IRQ_NOPROBE GOT_YOU_MORON +#undef IRQ_NOREQUEST +#define IRQ_NOREQUEST GOT_YOU_MORON +#undef IRQ_NOAUTOEN +#define IRQ_NOAUTOEN GOT_YOU_MORON +#undef IRQ_NESTED_THREAD +#define IRQ_NESTED_THREAD GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -90,3 +103,48 @@ static inline void irq_settings_set_level(struct irq_desc *desc) { desc->status |= _IRQ_LEVEL; } + +static inline bool irq_settings_can_request(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOREQUEST); +} + +static inline void irq_settings_clr_norequest(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOREQUEST; +} + +static inline void irq_settings_set_norequest(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOREQUEST; +} + +static inline bool irq_settings_can_probe(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOPROBE); +} + +static inline void irq_settings_clr_noprobe(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOPROBE; +} + +static inline void irq_settings_set_noprobe(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOPROBE; +} + +static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status & _IRQ_MOVE_PCNTXT; +} + +static inline bool irq_settings_can_autoenable(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOAUTOEN); +} + +static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) +{ + return desc->status & _IRQ_NESTED_THREAD; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 226ed7d26a84..dd586ebf9c8c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -68,8 +68,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* PER_CPU and nested thread interrupts are never polled */ - if (irq_settings_is_per_cpu(desc) || - (desc->status & IRQ_NESTED_THREAD)) + if (irq_settings_is_per_cpu(desc) || 
irq_settings_is_nested_thread(desc)) goto out; /* -- cgit v1.2.1 From f9e4989eb8183a1f33581fa1b99274287b0639d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 14:54:49 +0100 Subject: genirq: Force wrapped access to desc->status in core code Force the usage of wrappers by another nasty CPP substitution. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 6 +++--- kernel/irq/irqdesc.c | 4 ++-- kernel/irq/settings.h | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6e34bdbeb26a..cb62e2d0df4e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -55,7 +55,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0, irq = desc->irq_data.irq; + unsigned int random = 0, irq = desc->irq_data.irq; do { trace_irq_handler_entry(irq, action); @@ -98,7 +98,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - status |= action->flags; + random |= action->flags; break; default: @@ -109,7 +109,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (random & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); if (!noirqdebug) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 78866d050bc9..3387fbd7f2f4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = _IRQ_DEFAULT_INIT_FLAGS; + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; @@ -247,7 +247,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = _IRQ_DEFAULT_INIT_FLAGS, .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, @@ -271,6 +270,7 @@ int __init early_irq_init(void) desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; desc[i].kstat_irqs = alloc_percpu(unsigned int); + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 216b6f200e7c..47bcd3b9f399 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -148,3 +148,6 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status & _IRQ_NESTED_THREAD; } + +/* Nothing should touch desc->status from now on */ +#define status USE_THE_PROPER_WRAPPERS_YOU_MORON -- cgit v1.2.1 From 5d4d8fc9ac3e9a90bbdf90bae6864cb2c01f2208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:27:18 +0100 Subject: genirq: Cleanup irq.h Put the constants into an enum and document them. 
Signed-off-by: Thomas Gleixner --- kernel/irq/settings.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 47bcd3b9f399..55ebe1e09da4 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,37 +15,21 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#undef IRQ_INPROGRESS #define IRQ_INPROGRESS GOT_YOU_MORON -#undef IRQ_REPLAY #define IRQ_REPLAY GOT_YOU_MORON -#undef IRQ_WAITING #define IRQ_WAITING GOT_YOU_MORON -#undef IRQ_DISABLED #define IRQ_DISABLED GOT_YOU_MORON -#undef IRQ_PENDING #define IRQ_PENDING GOT_YOU_MORON -#undef IRQ_MASKED #define IRQ_MASKED GOT_YOU_MORON -#undef IRQ_WAKEUP #define IRQ_WAKEUP GOT_YOU_MORON -#undef IRQ_MOVE_PENDING #define IRQ_MOVE_PENDING GOT_YOU_MORON -#undef IRQ_PER_CPU #define IRQ_PER_CPU GOT_YOU_MORON -#undef IRQ_NO_BALANCING #define IRQ_NO_BALANCING GOT_YOU_MORON -#undef IRQ_AFFINITY_SET #define IRQ_AFFINITY_SET GOT_YOU_MORON -#undef IRQ_LEVEL #define IRQ_LEVEL GOT_YOU_MORON -#undef IRQ_NOPROBE #define IRQ_NOPROBE GOT_YOU_MORON -#undef IRQ_NOREQUEST #define IRQ_NOREQUEST GOT_YOU_MORON -#undef IRQ_NOAUTOEN #define IRQ_NOAUTOEN GOT_YOU_MORON -#undef IRQ_NESTED_THREAD #define IRQ_NESTED_THREAD GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON -- cgit v1.2.1 From d4d5e08960844a062da8387ee5f16ca7a33200d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 13:16:14 +0100 Subject: genirq: Add IRQCHIP_SET_TYPE_MASKED flag irq_chips, which require to mask the chip before changing the trigger type should set this flag. So the core takes care of it and the requirement for looking into desc->status in the chip goes away. Signed-off-by: Thomas Gleixner Cc: Linus Walleij Cc: Lars-Peter Clausen --- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 16 +++++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9e9220da4deb..4687457fe7f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -367,7 +367,7 @@ static inline void mask_ack_irq(struct irq_desc *desc) irq_state_set_masked(desc); } -static inline void mask_irq(struct irq_desc *desc) +void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); @@ -375,7 +375,7 @@ static inline void mask_irq(struct irq_desc *desc) } } -static inline void unmask_irq(struct irq_desc *desc) +void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6776453c454c..1d500fbde0d4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -84,6 +84,8 @@ extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); +extern void mask_irq(struct irq_desc *desc); +extern void unmask_irq(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b5de828e58d9..50809c79c7ad 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -554,8 +554,8 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { - int ret; struct irq_chip *chip = 
desc->irq_data.chip; + int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* @@ -568,6 +568,14 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, } flags &= IRQ_TYPE_SENSE_MASK; + + if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { + if (!(desc->istate & IRQS_MASKED)) + mask_irq(desc); + if (!(desc->istate & IRQS_DISABLED)) + unmask = 1; + } + /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); @@ -588,11 +596,13 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); - return 0; + ret = 0; default: pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } + if (unmask) + unmask_irq(desc); return ret; } @@ -669,7 +679,7 @@ again: #ifdef CONFIG_SMP /* - * Check whether we need to change the affinity of the interrupt thread. + * Check whether we need to chasnge the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) -- cgit v1.2.1 From 7f94226f03299f1ca32f118f02f2a0295e0e5e93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 19:46:26 +0100 Subject: genirq: Move wakeup state to irq_data Some irq_chips need to know the state of wakeup mode for setting the trigger type etc. Reflect it in irq_data state. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 4 ++-- kernel/irq/pm.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 1d500fbde0d4..5e2366da9f38 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -46,7 +46,6 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended - * IRQS_WAKEUP - irq triggers system wakeup from suspend */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,7 +59,6 @@ enum { IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, - IRQS_WAKEUP = 0x00001000, }; #include "compat.h" diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50809c79c7ad..ea6add6036b1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -492,7 +492,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 0; else - desc->istate |= IRQS_WAKEUP; + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { @@ -502,7 +502,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->istate &= ~IRQS_WAKEUP; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f39383d8672d..1329f0eff49e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,7 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->istate & IRQS_WAKEUP) && + if (irqd_is_wakeup_set(&desc->irq_data) && (desc->istate & IRQS_PENDING)) return -EBUSY; -- cgit v1.2.1 From e1ef824146131709d7466e37f889f2dab24ca98e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:25:31 +0100 Subject: genirq: Reflect IRQ_MOVE_PCNTXT in irq_data state Required by x86. 
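For illustration only (not part of this patch): an architecture that can tolerate affinity changes from process context marks the interrupt once, and after this change that information is mirrored into the irq_data state, where chip functions can see it without touching desc->status.

        /* hypothetical arch setup call site */
        irq_modify_status(irq, 0, IRQ_MOVE_PCNTXT);

        /*
         * irq_modify_status() now also sets IRQD_MOVE_PCNTXT in
         * irq_data.state, so code that only holds an irq_data pointer
         * can query the same property through the irqd_* accessors.
         */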
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4687457fe7f0..2b0f9192a830 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -712,11 +712,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL); + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_can_move_pcntxt(desc)) + irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); -- cgit v1.2.1 From a6967caf00ebbb2d4acdebcb72a25f2e9ba43fd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:01:25 +0100 Subject: genirq: Remove desc->status when GENERIC_HARDIRQS_NO_COMPAT=y If everything uses the right accessors, then enabling GENERIC_HARDIRQS_NO_COMPAT should just work. If not it will tell you. Don't be lazy and use the trick which I use in the core code! git grep status_use_accessors will unearth it in a split second. Offenders are tracked down and not slapped with stinking trouts. This time we use frozen shark for a better educational value. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 4 ++++ kernel/irq/settings.h | 1 + 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5e2366da9f38..fd5777ab2d34 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,6 +15,10 @@ #define istate core_internal_state__do_not_mess_with_it +#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +# define status status_use_accessors +#endif + extern int noirqdebug; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 55ebe1e09da4..0227ad358272 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -134,4 +134,5 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) } /* Nothing should touch desc->status from now on */ +#undef status #define status USE_THE_PROPER_WRAPPERS_YOU_MORON -- cgit v1.2.1 From 091738a266fc74329ae186f22ff2b3f01319112d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 20:16:43 +0100 Subject: genirq: Remove real old transition functions These transition helpers are stale for years now. Remove them. 
Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 6 ------ kernel/irq/chip.c | 16 ++++------------ kernel/irq/internals.h | 3 --- kernel/irq/manage.c | 14 +------------- 4 files changed, 5 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index c8bbc4fabaab..394784c57060 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -46,12 +46,6 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { - /* - * An old-style architecture might still have - * the handle_bad_irq handler there: - */ - compat_irq_chip_set_default_handler(desc); - /* * Some chips need to know about probing in * progress: diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2b0f9192a830..c19c0b562c80 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -644,19 +644,11 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, return; } - if (!handle) + if (!handle) { handle = handle_bad_irq; - else if (desc->irq_data.chip == &no_irq_chip) { - printk(KERN_WARNING "Trying to install %sinterrupt handler " - "for IRQ%d\n", is_chained ? "chained " : "", irq); - /* - * Some ARM implementations install a handler for really dumb - * interrupt hardware without setting an irq_chip. This worked - * with the ARM no_irq_chip but the check in setup_irq would - * prevent us to setup the interrupt at all. Switch it to - * dummy_irq_chip for easy transition. - */ - desc->irq_data.chip = &dummy_irq_chip; + } else { + if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + return; } chip_bus_lock(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fd5777ab2d34..f80a77471617 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,9 +74,6 @@ enum { /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); -/* Set default handler: */ -extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ea6add6036b1..99395a24f432 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -540,17 +540,6 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return !action; } -void compat_irq_chip_set_default_handler(struct irq_desc *desc) -{ - /* - * If the architecture still has not overriden - * the flow handler then zap the default. This - * should catch incorrect flow-type setting. - */ - if (desc->handle_irq == &handle_bad_irq) - desc->handle_irq = NULL; -} - int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { @@ -912,8 +901,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (ret) goto out_mask; - } else - compat_irq_chip_set_default_handler(desc); + } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_INPROGRESS | IRQS_ONESHOT | \ -- cgit v1.2.1 From d5eb4ad2dfb2dfae43fd51bc8630b4fc3ef00e92 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 12:16:16 +0100 Subject: genirq: Implement irq_get/put_desc_[bus]locked/unlock() Most of the managing functions get the irq descriptor and lock it - either with or without buslock. 
Instead of open coding this over and over provide a common function to do that. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 28 ++++++++++++++++++++++++++++ kernel/irq/irqdesc.c | 20 ++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f80a77471617..935bec4bfa87 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -126,6 +126,34 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); + +static inline struct irq_desc * +irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, true); +} + +static inline void +irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, true); +} + +static inline struct irq_desc * +irq_get_desc_lock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, false); +} + +static inline void +irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, false); +} + /* * Manipulation functions for irq_data.state */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 3387fbd7f2f4..394ab6a6c62c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -402,6 +402,26 @@ unsigned int irq_get_next_irq(unsigned int offset) return find_next_bit(allocated_irqs, nr_irqs, offset); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + } + return desc; +} + +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) +{ + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (bus) + chip_bus_sync_unlock(desc); +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize -- cgit v1.2.1 From 02725e7471b8dd58fa96f6604bdb5dde45405a2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 10:37:36 +0100 Subject: genirq: Use irq_get/put functions Convert the management functions to use the common irq_get/put function. 
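The effect on a typical caller can be sketched as follows; this is condensed from the conversions in the next patch, with foo() as a placeholder name.

        /* before: lookup, buslock and desc lock open coded everywhere */
        int foo(unsigned int irq)
        {
                struct irq_desc *desc = irq_to_desc(irq);
                unsigned long flags;

                if (!desc)
                        return -EINVAL;
                chip_bus_lock(desc);
                raw_spin_lock_irqsave(&desc->lock, flags);
                /* ... operate on desc ... */
                raw_spin_unlock_irqrestore(&desc->lock, flags);
                chip_bus_sync_unlock(desc);
                return 0;
        }

        /* after: */
        int foo(unsigned int irq)
        {
                unsigned long flags;
                struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);

                if (!desc)
                        return -EINVAL;
                /* ... operate on desc ... */
                irq_put_desc_busunlock(desc, flags);
                return 0;
        }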
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 93 +++++++++++++++-------------------------------------- kernel/irq/manage.c | 81 ++++++++++++++++++---------------------------- 2 files changed, 57 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c19c0b562c80..3ea6aecd99c0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -25,22 +25,18 @@ */ int irq_set_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } if (!chip) chip = &no_irq_chip; - raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->irq_data.chip = chip; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip); @@ -52,24 +48,17 @@ EXPORT_SYMBOL(irq_set_chip); */ int irq_set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - int ret = -ENXIO; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; - if (!desc) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } + if (!desc) + return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - return 0; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, type); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + if (type != IRQ_TYPE_NONE) + ret = __irq_set_trigger(desc, irq, type); + irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_type); @@ -83,18 +72,13 @@ EXPORT_SYMBOL(irq_set_irq_type); */ int irq_set_handler_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install controller data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.handler_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_handler_data); @@ -108,20 +92,15 @@ EXPORT_SYMBOL(irq_set_handler_data); */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install msi data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } @@ -134,24 +113,13 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) */ int irq_set_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install chip data for IRQ%d\n", irq); - return -EINVAL; - } - - if (!desc->irq_data.chip) { - printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.chip_data = data; - 
raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip_data); @@ -635,25 +603,19 @@ void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install type control for IRQ%d\n", irq); + if (!desc) return; - } if (!handle) { handle = handle_bad_irq; } else { if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) - return; + goto out; } - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) @@ -670,8 +632,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_norequest(desc); irq_startup(desc); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -693,14 +655,11 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return; - - raw_spin_lock_irqsave(&desc->lock, flags); - irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | @@ -714,5 +673,5 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 99395a24f432..6cca1956c503 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -172,16 +172,13 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return -EINVAL; - - raw_spin_lock_irqsave(&desc->lock, flags); desc->affinity_hint = m; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); @@ -336,6 +333,18 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) irq_disable(desc); } +static int __disable_irq_nosync(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return -EINVAL; + __disable_irq(desc, irq, false); + irq_put_desc_busunlock(desc, flags); + return 0; +} + /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable @@ -349,17 +358,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); @@ -377,13 +376,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc 
*desc = irq_to_desc(irq); - - if (!desc) - return; - - disable_irq_nosync(irq); - if (desc->action) + if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); @@ -434,21 +427,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); if (!desc) return; - if (WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - return; + goto out; - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); @@ -477,15 +467,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); @@ -505,9 +493,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_wake); @@ -519,25 +505,20 @@ EXPORT_SYMBOL(irq_set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + int canrequest = 0; if (!desc) return 0; - if (!irq_settings_can_request(desc)) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (action) - if (irqflags & action->flags & IRQF_SHARED) - action = NULL; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return !action; + if (irq_settings_can_request(desc)) { + if (desc->action) + if (irqflags & desc->action->flags & IRQF_SHARED) + canrequest =1; + } + irq_put_desc_unlock(desc, flags); + return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, -- cgit v1.2.1 From 3836ca08aad4575c120ccf328652f3873eea9063 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 20:09:19 +0100 Subject: genirq: Consolidate set_chip_handler functions No need to have separate functions if we have one plus inline wrappers. 
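The header side presumably shrinks to a one-line inline wrapper on top of the surviving function, roughly as below; the include/linux/irq.h change is not part of the kernel/ diff shown here.

        static inline void
        irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip,
                                 irq_flow_handler_t handle)
        {
                irq_set_chip_and_handler_name(irq, chip, handle, NULL);
        }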
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel')
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ea6aecd99c0..2a36038b8f59 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -600,7 +600,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) } void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { unsigned long flags; @@ -635,22 +635,14 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, out: irq_put_desc_busunlock(desc, flags); } -EXPORT_SYMBOL_GPL(__set_irq_handler); +EXPORT_SYMBOL_GPL(__irq_set_handler); void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - irq_set_chip(irq, chip); - __set_irq_handler(irq, handle, 0, NULL); -} - -void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); - __set_irq_handler(irq, handle, 0, name); + __irq_set_handler(irq, handle, 0, name); } void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
-- cgit v1.2.1
From 781295762defc709a609efc01d8bb065276cd9a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 15:14:20 +0100 Subject: genirq: Add preflow handler support sparc64 needs to call a preflow handler on certain interrupts before calling the action chain. Integrate it into handle_fasteoi_irq. Must be enabled via CONFIG_IRQ_PREFLOW_FASTEOI. No impact when disabled. Signed-off-by: Thomas Gleixner Cc: David S. Miller --- kernel/irq/Kconfig | 3 +++ kernel/irq/chip.c | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel')
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 48ad25f5fa59..9149be729e45 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -35,6 +35,9 @@ config AUTO_IRQ_AFFINITY config HARDIRQS_SW_RESEND def_bool n +config IRQ_PREFLOW_FASTEOI + def_bool n + config SPARSE_IRQ bool "Support sparse irq numbering" depends on HAVE_SPARSE_IRQ
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2a36038b8f59..08be5d182be3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -471,6 +471,16 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_level_irq); +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void preflow_handler(struct irq_desc *desc) +{ + if (desc->preflow_handler) + desc->preflow_handler(&desc->irq_data); +} +#else +static inline void preflow_handler(struct irq_desc *desc) { } +#endif + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -503,6 +513,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) mask_irq(desc); goto out; } + preflow_handler(desc); handle_irq_event(desc); out: desc->irq_data.chip->irq_eoi(&desc->irq_data);
-- cgit v1.2.1
From 77694b408abb8f92195ad5ed6ce5492f1d794c77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Feb 2011 10:33:57 +0100 Subject: genirq: Add fasteoi irq_chip quirk Some chips want irq_eoi() called only when an interrupt is actually handled. So they have checks for INPROGRESS and DISABLED in their irq_eoi callbacks. Add a chip flag which allows the generic code to handle that. No impact on the fastpath.
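A driver-side user would then declare the quirk in its irq_chip and drop the equivalent checks from its own irq_eoi() callback. A minimal sketch, with made-up foo_* names:

        static void foo_mask(struct irq_data *d);
        static void foo_unmask(struct irq_data *d);
        static void foo_eoi(struct irq_data *d);    /* no INPROGRESS/DISABLED checks needed */

        static struct irq_chip foo_chip = {
                .name           = "FOO",
                .irq_mask       = foo_mask,
                .irq_unmask     = foo_unmask,
                .irq_eoi        = foo_eoi,
                .flags          = IRQCHIP_EOI_IF_HANDLED,
        };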
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 08be5d182be3..1d3e25e68b0c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -515,9 +515,16 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) } preflow_handler(desc); handle_irq_event(desc); -out: + +out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); +out_unlock: raw_spin_unlock(&desc->lock); + return; +out: + if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) + goto out_eoi; + goto out_unlock; } /** -- cgit v1.2.1 From a439520f8b18917b322f576be04c54aba84bb044 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 18:46:16 +0100 Subject: genirq: Implement irq_data based move_*_irq() versions No need to lookup the irq descriptor when calling from a chip callback function which has irq_data already handy. Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 7a93c6b88b25..ec4806d4778b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,10 +4,10 @@ #include "internals.h" -void move_masked_irq(int irq) +void irq_move_masked_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_desc *desc = irq_data_to_desc(idata); + struct irq_chip *chip = idata->chip; if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; @@ -53,12 +53,17 @@ void move_masked_irq(int irq) cpumask_clear(desc->pending_mask); } -void move_native_irq(int irq) +void move_masked_irq(int irq) +{ + irq_move_masked_irq(irq_get_irq_data(irq)); +} + +void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(idata); bool masked; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if (likely(!irqd_is_setaffinity_pending(idata))) return; if (unlikely(desc->istate & IRQS_DISABLED)) @@ -71,8 +76,13 @@ void move_native_irq(int irq) */ masked = desc->istate & IRQS_MASKED; if (!masked) - desc->irq_data.chip->irq_mask(&desc->irq_data); - move_masked_irq(irq); + idata->chip->irq_mask(idata); + irq_move_masked_irq(idata); if (!masked) - desc->irq_data.chip->irq_unmask(&desc->irq_data); + idata->chip->irq_unmask(idata); +} + +void move_native_irq(int irq) +{ + irq_move_irq(irq_get_irq_data(irq)); } -- cgit v1.2.1 From 24d51add7438f9696a7205927bf9de3c5c787a58 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Feb 2011 09:52:50 +0100 Subject: workqueue: fix build failure introduced by s/freezeable/freezable/ wq:fixes-2.6.38 does s/WQ_FREEZEABLE/WQ_FREEZABLE and wq:for-2.6.39 adds new usage of the flag. The combination of the two creates a build failure after merge. Fix it by renaming all freezeables to freezables. 
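Driver or out-of-tree code that used the old spelling simply follows the rename; an illustrative fragment, not taken from the patch:

        /* was: alloc_workqueue("foo", WQ_FREEZEABLE, 0); */
        struct workqueue_struct *wq = alloc_workqueue("foo", WQ_FREEZABLE, 0);

        if (!wq)
                return -ENOMEM;

        /*
         * Work that must not run while tasks are frozen can also be put on
         * the renamed system queue, e.g. queue_work(system_freezable_wq, &work).
         */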
Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 572f559f6cb9..1b64d225f067 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -251,12 +251,12 @@ struct workqueue_struct *system_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezeable_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); -EXPORT_SYMBOL_GPL(system_freezeable_wq); +EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include @@ -3777,10 +3777,10 @@ static int __init init_workqueues(void) system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); - system_freezeable_wq = alloc_workqueue("events_freezeable", - WQ_FREEZEABLE, 0); + system_freezable_wq = alloc_workqueue("events_freezable", + WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezeable_wq); + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.1 From a61d825808a0ce9935afebc225dcd602d5339e14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Feb 2011 12:54:34 +0100 Subject: genirq: Fix misplaced status update in irq_disable() We lazy disable interrupt lines, so only mark the line masked, when the chip provides an irq_disable callback. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1d3e25e68b0c..b5145654855f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -199,8 +199,8 @@ void irq_disable(struct irq_desc *desc) irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); } - irq_state_set_masked(desc); } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -- cgit v1.2.1 From ed4dea6e0e33a3e58d8b77b775a8f0e433e7a005 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Feb 2011 11:07:37 -0800 Subject: genirq: Use IRQ_BITMAP_BITS as search size in irq_alloc_descs() The runtime expansion of nr_irqs does not take into account that bitmap_find_next_zero_area() returns "start" + size in case the search for an matching zero area fails. That results in a start value which can be completely off and is not covered by the following expand_nr_irqs() and possibly outside of the absolute limit. But we use it without further checking. Use IRQ_BITMAP_BITS as the limit for the bitmap search and expand nr_irqs when the start bit is beyond nr_irqs. So start is always pointing to the correct area in the bitmap. nr_irqs is just the limit for irq enumerations, not the real limit for the irq space. [ tglx: Let irq_expand_nr_irqs() take the new upper end so we do not expand nr_irqs more than necessary. 
Made changelog readable ] Signed-off-by: Yinghai Lu LKML-Reference: <4D6014F9.8040605@kernel.org> Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 394ab6a6c62c..dbccc799407f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -207,11 +207,11 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } -static int irq_expand_nr_irqs(unsigned int cnt) +static int irq_expand_nr_irqs(unsigned int nr) { - if (nr_irqs + cnt > IRQ_BITMAP_BITS) + if (nr > IRQ_BITMAP_BITS) return -ENOMEM; - nr_irqs += cnt; + nr_irqs = nr; return 0; } @@ -298,7 +298,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) return start; } -static int irq_expand_nr_irqs(unsigned int cnt) +static int irq_expand_nr_irqs(unsigned int nr) { return -ENOMEM; } @@ -346,13 +346,14 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, + from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) goto err; - if (start >= nr_irqs) { - ret = irq_expand_nr_irqs(cnt); + if (start + cnt > nr_irqs) { + ret = irq_expand_nr_irqs(start + cnt); if (ret) goto err; } -- cgit v1.2.1 From 8fff39e06987492da3d4a0b9ec7cdbd245b6762b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Feb 2011 14:19:42 +0100 Subject: genirq: Add missing break in __irq_set_trigger() The switch case in __irq_set_trigger() lacks a break, which emits a pr_err unconditionally on success. Reported-by: Lars-Peter Clausen Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6cca1956c503..01f8a9519e63 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -567,6 +567,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); ret = 0; + break; default: pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); -- cgit v1.2.1 From e06383db9ec591696a06654257474b85bac1f8cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 14 Dec 2010 19:37:07 -0800 Subject: hrtimers: extend hrtimer base code to handle more then 2 clockids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hrtimer code is written mainly with CLOCK_REALTIME and CLOCK_MONOTONIC in mind. These are clockids 0 and 1 resepctively. However, if we are to introduce any new hrtimer bases, using new clockids, we have to skip the cputimers (clockids 2,3) as well as other clockids that may not impelement timers. This patch adds a little bit of indirection between the clockid and the base, so that we can extend the base by one when we add a new clockid at number 7 or so. 
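In code terms the indirection is just a small lookup table, as the diff below introduces; condensed, the base selection becomes roughly:

        static int hrtimer_clock_to_base_table[MAX_CLOCKS];

        static inline int hrtimer_clockid_to_base(clockid_t clock_id)
        {
                return hrtimer_clock_to_base_table[clock_id];
        }

        /* instead of indexing clock_base[] by the raw clockid: */
        timer->base = &cpu_base->clock_base[hrtimer_clockid_to_base(clock_id)];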
CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve HjønnevÃ¥g Signed-off-by: John Stultz --- kernel/hrtimer.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 57c4d33c9a9d..ca99e2454600 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -53,11 +53,10 @@ /* * The timer bases: * - * Note: If we want to add new timer bases, we have to skip the two - * clock ids captured by the cpu-timers. We do this by holding empty - * entries rather than doing math adjustment of the clock ids. - * This ensures that we capture erroneous accesses to these clock ids - * rather than moving them into the range of valid clock id's. + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { @@ -77,6 +76,14 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; +static int hrtimer_clock_to_base_table[MAX_CLOCKS]; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. @@ -90,8 +97,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); - base->clock_base[CLOCK_REALTIME].softirq_time = xtim; - base->clock_base[CLOCK_MONOTONIC].softirq_time = + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = ktime_add(xtim, tomono); } @@ -179,10 +186,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); int cpu = hrtimer_get_target(this_cpu, pinned); + int basenum = hrtimer_clockid_to_base(base->index); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[base->index]; + new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* @@ -618,7 +626,7 @@ static void retrigger_next_event(void *arg) /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[CLOCK_REALTIME].offset = + base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); hrtimer_force_reprogram(base, 0); @@ -716,8 +724,8 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -1112,6 +1120,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { struct hrtimer_cpu_base *cpu_base; + int base; memset(timer, 0, sizeof(struct hrtimer)); @@ -1120,7 +1129,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &cpu_base->clock_base[clock_id]; + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; hrtimer_init_timer_hres(timer); 
timerqueue_init(&timer->node); @@ -1156,9 +1166,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; } @@ -1705,6 +1716,9 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { void __init hrtimers_init(void) { + hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; + hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -- cgit v1.2.1 From abb3a4ea2e0ea7114a4475745da2f32bd9ad5b73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Feb 2011 17:52:09 -0800 Subject: time: Introduce get_monotonic_boottime and ktime_get_boottime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds new functions that return the monotonic time since boot (in other words, CLOCK_MONOTONIC + suspend time). CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve HjønnevÃ¥g Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6262c1d18397..5fbd9aa7df95 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -907,7 +907,7 @@ static void update_wall_time(void) * getboottime - Return the real time of system boot. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which @@ -925,6 +925,55 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. + */ +void get_monotonic_boottime(struct timespec *ts) +{ + struct timespec tomono, sleep; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + sleep = total_sleep_time; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ + struct timespec ts; + + get_monotonic_boottime(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); + /** * monotonic_to_bootbased - Convert the monotonic time to boot based. 
* @ts: pointer to the timespec to be converted -- cgit v1.2.1 From 314ac37150011ebb398f522db528d2dbcc611189 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Feb 2011 18:43:08 -0800 Subject: time: Extend get_xtime_and_monotonic_offset() to also return sleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend get_xtime_and_monotonic_offset to get_xtime_and_monotonic_and_sleep_offset(). CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve HjønnevÃ¥g Signed-off-by: John Stultz --- kernel/hrtimer.c | 9 +++++---- kernel/time/timekeeping.c | 8 ++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ca99e2454600..e8bf3ad99063 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -91,9 +91,9 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; - struct timespec xts, tom; + struct timespec xts, tom, slp; - get_xtime_and_monotonic_offset(&xts, &tom); + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -614,12 +614,13 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm; + struct timespec realtime_offset, wtm, sleep; if (!hrtimer_hres_active()) return; - get_xtime_and_monotonic_offset(&realtime_offset, &wtm); + get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, + &sleep); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5fbd9aa7df95..3bd7e3d5c632 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1040,11 +1040,14 @@ void do_timer(unsigned long ticks) } /** - * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + * and sleep offsets. * @xtim: pointer to timespec to be set with xtime * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend */ -void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom) +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) { unsigned long seq; @@ -1052,6 +1055,7 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom seq = read_seqbegin(&xtime_lock); *xtim = xtime; *wtom = wall_to_monotonic; + *sleep = total_sleep_time; } while (read_seqretry(&xtime_lock, seq)); } -- cgit v1.2.1 From 70a08cca1227dc31c784ec930099a4417a06e7d0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 15 Feb 2011 10:45:16 -0800 Subject: timers: Add CLOCK_BOOTTIME hrtimer base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CLOCK_MONOTONIC stops while the system is in suspend. This is because to applications system suspend is invisible. However, there is a growing set of applications that are wanting to be suspend-aware, but do not want to deal with the complications of CLOCK_REALTIME (which might jump around if settimeofday is called). For these applications, I propose a new clockid: CLOCK_BOOTTIME. 
CLOCK_BOOTTIME is idential to CLOCK_MONOTONIC, except it also includes any time spent in suspend. This patch add hrtimer base for CLOCK_BOOTTIME, using get_monotonic_boottime/ktime_get_boottime, to allow in kernel users to set timers against. CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve HjønnevÃ¥g Signed-off-by: John Stultz --- kernel/hrtimer.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e8bf3ad99063..4c53af18734b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -73,6 +73,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get, .resolution = KTIME_LOW_RES, }, + { + .index = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, } }; @@ -90,16 +95,17 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, tomono; + ktime_t xtim, mono, boot; struct timespec xts, tom, slp; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); - tomono = timespec_to_ktime(tom); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = - ktime_add(xtim, tomono); + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; } /* @@ -727,6 +733,7 @@ static int hrtimer_switch_to_hres(void) base->hres_active = 1; base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -1719,6 +1726,7 @@ void __init hrtimers_init(void) { hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); -- cgit v1.2.1 From 7fdd7f89006dd5a4c702fa0ce0c272345fa44ae0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 15 Feb 2011 10:52:57 -0800 Subject: timers: Export CLOCK_BOOTTIME via the posix timers interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch exports CLOCK_BOOTTIME through the posix timers interface CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve HjønnevÃ¥g Signed-off-by: John Stultz --- kernel/posix-timers.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44fcff131b38..4c0124919f9a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -187,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* - * Get monotonic time for posix timers + * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) { @@ -214,6 +214,14 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; } + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + 
get_monotonic_boottime(tp); + return 0; +} + + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -253,12 +261,23 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, -- cgit v1.2.1 From 70433c01613c2a44756c7b25f7bdd6c1c77b119f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Feb 2011 12:50:12 +0100 Subject: genirq: Use the correct variable for note_interrupt note_interrupt wants to be called with the combined result of all handlers called, not with the last one. If it's a shared interrupt then the last handler might return IRQ_NONE often enough to trigger the spurious dectector which turns off a perfectly fine working interrupt line. Bug was introduced in commit 1277a532(genirq: Simplify handle_irq_event()). Yes, I really messed up there. First the variable ret should not have been named differently to avoid similarity with retval. Second it should have been declared in the do {} loop. Rename it to res and move it into the do {} loop and vanish under a huge brown paperbag. Reported-bisected-tested-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index cb62e2d0df4e..e099e9e9de0b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,24 +54,26 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret, retval = IRQ_NONE; + irqreturn_t retval = IRQ_NONE; unsigned int random = 0, irq = desc->irq_data.irq; do { + irqreturn_t res; + trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); - trace_irq_handler_exit(irq, action, ret); + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); if (WARN_ON_ONCE(!irqs_disabled())) local_irq_disable(); - switch (ret) { + switch (res) { case IRQ_WAKE_THREAD: /* * Set result to handled so the spurious check * does not trigger. 
*/ - ret = IRQ_HANDLED; + res = IRQ_HANDLED; /* * Catch drivers which return WAKE_THREAD but @@ -105,7 +107,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - retval |= ret; + retval |= res; action = action->next; } while (action); @@ -113,7 +115,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) add_interrupt_randomness(irq); if (!noirqdebug) - note_interrupt(irq, desc, ret); + note_interrupt(irq, desc, retval); return retval; } -- cgit v1.2.1 From dbebbfbb1605f0179e7c0d900d941cc9c45de569 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Feb 2011 21:46:25 +0100 Subject: rtmutex: tester: Remove the remaining BKL leftovers We just leave the numbers assinged as commemoration and in case that someone was crazy enough to reimplement the test stuff out of tree. Signed-off-by: Thomas Gleixner --- kernel/rtmutex-tester.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index d5b543506cbc..5c9ccd380966 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -44,9 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Was: Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Was: Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ }; -- cgit v1.2.1 From fd4afaf33313d94f548cb09129ecba3dbab62931 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 17 Feb 2011 13:39:05 +0000 Subject: genirq: Streamline kernel/irq/Kconfig "def_bool n" without prompt is pointless, these should be just "bool". [ tglx: Adapted to latest changes ] Signed-off-by: Jan Beulich LKML-Reference: <4D5D3309020000780003264A@vpn.id2.novell.com> Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9149be729e45..355b8c7957f5 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,5 +1,5 @@ config HAVE_GENERIC_HARDIRQS - def_bool n + bool if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" @@ -11,32 +11,32 @@ config GENERIC_HARDIRQS # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED - def_bool n + bool config GENERIC_HARDIRQS_NO_COMPAT - def_bool n + bool # Options selectable by the architecture code config HAVE_SPARSE_IRQ - def_bool n + bool config GENERIC_IRQ_PROBE - def_bool n + bool config GENERIC_IRQ_SHOW - def_bool n + bool config GENERIC_PENDING_IRQ - def_bool n + bool config AUTO_IRQ_AFFINITY - def_bool n + bool config HARDIRQS_SW_RESEND - def_bool n + bool config IRQ_PREFLOW_FASTEOI - def_bool n + bool config SPARSE_IRQ bool "Support sparse irq numbering" -- cgit v1.2.1 From c186fafe9aba87c1a93df8c7120a6ae01fe435ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:52:53 +0100 Subject: sched: Clean up remnants of sd_idle With the wholesale removal of the sd_idle SMT logic we can clean up some more. 
Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d384e739ea95..cd18600a8a63 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3150,25 +3150,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.this_load >= sds.avg_load) goto out_balanced; - /* - * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. - * And to check for busy balance use !idle_cpu instead of - * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE - * even when they are idle. - */ - if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } else { + if (idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && sds.busiest_nr_running <= sds.busiest_group_weight) goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; } force_balance: @@ -3862,8 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. + * longer idle. */ idle = CPU_NOT_IDLE; } -- cgit v1.2.1 From cc57aa8f4b3bece8c26c7929728edcc5fa6b5aed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:55:32 +0100 Subject: sched: Clean up some f_b_g() comments The existing comment tends to grow state (as it already has), split it up and place it near the actual tests. Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cd18600a8a63..03496ebc4553 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3113,19 +3113,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * - * Note: when doing newidle balance, if the local group has excess - * capacity (i.e. nr_running < group_capacity) and the busiest group - * does not have any capacity, we force a load balance to pull tasks - * to the local group. In this case, we skip past checks 3, 4 and 5. + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. 
*/ if (!(*balance)) goto ret; @@ -3134,19 +3124,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, check_asym_packing(sd, &sds, this_cpu, imbalance)) return sds.busiest; + /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ if (sds.this_load >= sds.max_load) goto out_balanced; + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load) goto out_balanced; -- cgit v1.2.1 From 866ab43efd325fae8889ea77a744d03f2b957e38 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:56:47 +0100 Subject: sched: Fix the group_imb logic On a 2*6*2 machine something like: taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done' _should_ result in 9 busy CPUs, each running 1 task. However it didn't quite work reliably, most of the time one cpu of the second socket (6-11) would be idle and one cpu of the first socket (0-5) would have two tasks on it. The group_imb logic is supposed to deal with this and detect when a particular group is imbalanced (like in our case, 0-2 are idle but 3-5 will have 4 tasks on it). The detection phase needed a bit of a tweak as it was too weak and required more than 2 avg weight tasks difference between idle and busy cpus in the group which won't trigger for our test-case. So cure that to be one or more avg task weight difference between cpus. Once the detection phase worked, it was then defeated by the f_b_g() tests trying to avoid ping-pongs. In particular, this_load >= max_load triggered because the pulling cpu (the (first) idle cpu in on the second socket, say 6) would find this_load to be 5 and max_load to be 4 (there'd be 5 tasks running on our socket and only 4 on the other socket). Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 03496ebc4553..3a88dee165c0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2743,7 +2743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. + * than the average weight of a task. 
* * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a @@ -2753,7 +2753,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); @@ -3128,6 +3128,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) -- cgit v1.2.1 From 1747b21fecbfb63fbf6b9624e8b92707960d5a97 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 20 Feb 2011 15:08:12 +0800 Subject: sched, autogroup, sysctl: Use proc_dointvec_minmax() instead sched_autogroup_enabled has min/max value, proc_dointvec_minmax() is be used for this case. Signed-off-by: Yong Zhang Cc: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1298185696-4403-2-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb7c830f7faa..7b5eeadfb254 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, -- cgit v1.2.1 From 800d4d30c8f20bd728e5741a3b77c4859a613f7c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 20 Feb 2011 15:08:14 +0800 Subject: sched, autogroup: Stop going ahead if autogroup is disabled when autogroup is disable from the beginning, sched_autogroup_create_attach() autogroup_move_group() <== 1 sched_move_task() <== 2 task_move_group_fair() set_task_rq() task_group() autogroup_task_group() We go the whole path without doing anything useful. Then stop going further if autogroup is disabled. But there will be a race window between 1 and 2, in which sysctl_sched_autogroup_enabled is enabled. This issue will be toke by following patch. 
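For reference, the disabled case this patch optimizes can be set up either at runtime or from boot; the sysctl and boot parameter names below come from the autogroup feature itself and are assumptions here, not something this patch adds:

# echo 0 > /proc/sys/kernel/sched_autogroup_enabled

or boot with "noautogroup" on the kernel command line so that autogrouping is off from the beginning.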
Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1298185696-4403-4-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 4 ++++ kernel/sched_autogroup.h | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 9fb656283157..137a096ae9d8 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -161,11 +161,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 7b859ffe5dad..05577055cfca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -1,6 +1,11 @@ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; -- cgit v1.2.1 From 511f67a5997c4967c69a3961e2fc9f04d8d244ac Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 22 Feb 2011 15:02:00 +0100 Subject: sched, autogroup: Stop claiming ownership of the root task group Disown it, and only display autogroup association if one exists. Signed-off-by: Mike Galbraith Reviewed-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <1298383320.8036.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 137a096ae9d8..5946ac515602 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; static void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; - root_task_group.autogroup = &autogroup_default; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; @@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) static inline bool task_group_is_autogroup(struct task_group *tg) { - return tg != &root_task_group && tg->autogroup; + return !!tg->autogroup; } static inline struct task_group * @@ -251,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) { struct autogroup *ag = autogroup_task_get(p); + if (!task_group_is_autogroup(ag->tg)) + goto out; + down_read(&ag->lock); seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); up_read(&ag->lock); +out: autogroup_kref_put(ag); } #endif /* CONFIG_PROC_FS */ @@ -262,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (!enabled || !tg->autogroup) + if (!task_group_is_autogroup(tg)) return 0; return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); -- cgit v1.2.1 From 3f7cce3c18188a067d463749168bdda5abc5b0f7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 18 Feb 2011 14:40:01 +0200 Subject: perf_events: Fix rcu 
and locking issues with cgroup support This patches ensures that we do not end up calling perf_cgroup_from_task() when there is no cgroup event. This avoids potential RCU and locking issues. The change in perf_cgroup_set_timestamp() ensures we check against ctx->nr_cgroups. It also avoids calling perf_clock() tiwce in a row. It also ensures we do need to grab ctx->lock before calling the function. We drop update_cgrp_time() from task_clock_event_read() because it is not needed. This also avoids having to deal with perf_cgroup_from_task(). Thanks to Peter Zijlstra for his help on this. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d5e76b8.815bdf0a.7ac3.774f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a0a6987fabc4..dadeaea4b3fc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -201,6 +201,11 @@ __get_cpu_context(struct perf_event_context *ctx) #ifdef CONFIG_CGROUP_PERF +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { @@ -268,28 +273,41 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) static inline void update_cgrp_time_from_event(struct perf_event *event) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + struct perf_cgroup *cgrp; + /* - * do not update time when cgroup is not active + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) */ - if (!event->cgrp || cgrp != event->cgrp) + if (!is_cgroup_event(event)) return; - __update_cgrp_time(event->cgrp); + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; - if (!task) + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) return; cgrp = perf_cgroup_from_task(task); info = this_cpu_ptr(cgrp->info); - info->timestamp = now; + info->timestamp = ctx->timestamp; } #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ @@ -494,7 +512,8 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { } @@ -1613,7 +1632,7 @@ static int __perf_event_enable(void *info) /* * set current task's cgroup time reference point */ - perf_cgroup_set_timestamp(current, perf_clock()); + perf_cgroup_set_timestamp(current, ctx); __perf_event_mark_enabled(event, ctx); @@ -2048,7 +2067,7 @@ ctx_sched_in(struct perf_event_context *ctx, now = perf_clock(); ctx->timestamp = now; - perf_cgroup_set_timestamp(task, now); + perf_cgroup_set_timestamp(task, ctx); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
@@ -5795,7 +5814,6 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); - update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); -- cgit v1.2.1 From 768a06e2ca49cdf72389208cfc056a36cf8bc5e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Feb 2011 16:52:24 +0100 Subject: perf: Simplify task_clock_event_read() There is no point in us having different code paths for nmi and !nmi here, so remove the !nmi one. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index dadeaea4b3fc..64a018e94fca 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5810,16 +5810,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } -- cgit v1.2.1 From 7e9498705e810404ecf29bb2d6fa632b9484c609 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 25 Feb 2011 14:32:28 +0100 Subject: sched: Add #ifdef around irq time accounting functions Get rid of this: kernel/sched.c:3731:13: warning: 'irqtime_account_idle_ticks' defined but not used kernel/sched.c:3732:13: warning: 'irqtime_account_process_tick' defined but not used Signed-off-by: Heiko Carstens Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <20110225133228.GD7469@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2effcb71a478..79bca166bed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3687,6 +3687,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +#ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -3753,6 +3754,7 @@ static void irqtime_account_idle_ticks(int ticks) {} static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} #endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ /* * Account for involuntary wait time. -- cgit v1.2.1 From 1204e95689f9fbd245a4ce5c1b0cd0a9b77f8d25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Feb 2011 17:17:18 +0100 Subject: genirq: Make warning in handle_percpu_event useful The WARN_ON_ONCE in handle_percpu_event() which emits a warning when an action handler returns with interrupts enabled is not really useful. It does not reveal the interrupt number and handler function which caused it. Make it WARN_ONCE() and add the information. 
Reported-by: Tony Luck Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e099e9e9de0b..b110c835e070 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -64,7 +64,8 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ON_ONCE(!irqs_disabled())) + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + irq, action->handler)) local_irq_disable(); switch (res) { -- cgit v1.2.1 From b5faba21a6805c33b40e258d36f57997ee1de131 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:13 +0000 Subject: genirq: Prepare the handling of shared oneshot interrupts For level type interrupts we need to track how many threads are on flight to avoid useless interrupt storms when not all thread handlers have finished yet. Keep track of the woken threads and only unmask when there are no more threads in flight. Yes, I'm lazy and using a bitfield. But not only because I'm lazy, the main reason is that it's way simpler than using a refcount. A refcount based solution would need to keep track of various things like crashing the irq thread, spurious interrupts coming in, disables/enables, free_irq() and some more. The bitfield keeps the tracking simple and makes things just work. It's also nicely confined to the thread code pathes and does not require additional checks all over the place. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.388095876@linutronix.de> --- kernel/irq/handle.c | 76 ++++++++++++++++++++++++++++++++++++++++++++--------- kernel/irq/manage.c | 54 ++++++++++++++++++++++++++++++++----- 2 files changed, 111 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b110c835e070..517561fc7317 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,6 +51,68 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } +static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +{ + /* + * Wake up the handler thread for this action. In case the + * thread crashed and was killed we just pretend that we + * handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. If the + * RUNTHREAD bit is already set, nothing to do. + */ + if (test_bit(IRQTF_DIED, &action->thread_flags) || + test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + return; + + /* + * It's safe to OR the mask lockless here. We have only two + * places which write to threads_oneshot: This code and the + * irq thread. + * + * This code is the hard irq context and can never run on two + * cpus in parallel. If it ever does we have more serious + * problems than this bitmask. + * + * The irq threads of this irq which clear their "running" bit + * in threads_oneshot are serialized via desc->lock against + * each other and they are serialized against this code by + * IRQS_INPROGRESS. 
+ * + * Hard irq handler: + * + * spin_lock(desc->lock); + * desc->state |= IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + * desc->threads_oneshot |= mask; + * spin_lock(desc->lock); + * desc->state &= ~IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * + * irq thread: + * + * again: + * spin_lock(desc->lock); + * if (desc->state & IRQS_INPROGRESS) { + * spin_unlock(desc->lock); + * while(desc->state & IRQS_INPROGRESS) + * cpu_relax(); + * goto again; + * } + * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + * desc->threads_oneshot &= ~mask; + * spin_unlock(desc->lock); + * + * So either the thread waits for us to clear IRQS_INPROGRESS + * or we are waiting in the flow handler for desc->lock to be + * released before we reach this point. The thread also checks + * IRQTF_RUNTHREAD under desc->lock. If set it leaves + * threads_oneshot untouched and runs the thread another time. + */ + desc->threads_oneshot |= action->thread_mask; + wake_up_process(action->thread); +} + irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { @@ -85,19 +147,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - /* - * Wake up the handler thread for this - * action. In case the thread crashed and was - * killed we just pretend that we handled the - * interrupt. The hardirq handler above has - * disabled the device interrupt, so no irq - * storm is lurking. - */ - if (likely(!test_bit(IRQTF_DIED, - &action->thread_flags))) { - set_bit(IRQTF_RUNTHREAD, &action->thread_flags); - wake_up_process(action->thread); - } + irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 01f8a9519e63..2301de19ac7d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -617,8 +617,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ -static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +static void irq_finalize_oneshot(struct irq_desc *desc, + struct irqaction *action, bool force) { + if (!(desc->istate & IRQS_ONESHOT)) + return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); @@ -631,6 +634,11 @@ again: * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. + * + * This also serializes the state of shared oneshot handlers + * versus "desc->threads_onehsot |= action->thread_mask;" in + * irq_wake_thread(). See the comment there which explains the + * serialization. */ if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); @@ -639,11 +647,23 @@ again: goto again; } - if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) { + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the threads_oneshot bit of this thread which + * was just set. 
+ */ + if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + goto out_unlock; + + desc->threads_oneshot &= ~action->thread_mask; + + if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && + (desc->istate & IRQS_MASKED)) { irq_compat_clr_masked(desc); desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } +out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } @@ -691,7 +711,7 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->istate & IRQS_ONESHOT; + int wake; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -719,8 +739,7 @@ static int irq_thread(void *data) action->thread_fn(action->irq, action->dev_id); - if (oneshot) - irq_finalize_oneshot(action->irq, desc); + irq_finalize_oneshot(desc, action, false); } wake = atomic_dec_and_test(&desc->threads_active); @@ -729,6 +748,9 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. @@ -743,6 +765,7 @@ static int irq_thread(void *data) void exit_irq_thread(void) { struct task_struct *tsk = current; + struct irq_desc *desc; if (!tsk->irqaction) return; @@ -751,6 +774,14 @@ void exit_irq_thread(void) "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + desc = irq_to_desc(tsk->irqaction->irq); + + /* + * Prevent a stale desc->threads_oneshot. Must be called + * before setting the IRQTF_DIED flag. + */ + irq_finalize_oneshot(desc, tsk->irqaction, true); + /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. @@ -767,7 +798,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; + unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; cpumask_var_t mask; @@ -865,12 +896,23 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } + /* + * Setup the thread mask for this irqaction. Unlikely to have + * 32 resp 64 irqs sharing one line, but who knows. + */ + if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { irq_chip_set_defaults(desc->irq_data.chip); -- cgit v1.2.1 From 9d591edd02a245305b1b9379e4c5571bad4d2774 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:16 +0000 Subject: genirq: Allow shared oneshot interrupts Support ONESHOT on shared interrupts, if all drivers agree on it. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.483640430@linutronix.de> --- kernel/irq/manage.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2301de19ac7d..58c861367300 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -824,10 +824,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } - /* Oneshot interrupts are not allowed with shared */ - if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) - return -EINVAL; - /* * Check whether the interrupt nests into another interrupt * thread. @@ -881,10 +877,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. + * set the trigger type must match. Also all must + * agree on ONESHOT. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) { old_name = old->name; goto mismatch; } -- cgit v1.2.1 From 8eb90c30e0e815a1308828352eabd03ca04229dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:21 +0000 Subject: sched: Switch wait_task_inactive to schedule_hrtimeout() When we force thread hard and soft interrupts the startup of ksoftirqd would hang in kthread_bind() when wait_task_inactive() calls schedule_timeout_uninterruptible() because there is no softirq yet which will wake us up. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.677109139@linutronix.de> --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..66ca5d9ba83c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2224,7 +2224,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } -- cgit v1.2.1 From 544b4a1f309d18f40969dbab7e08bafd136b2f55 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 25 Feb 2011 15:13:16 -0800 Subject: sched: Clean up the IRQ_TIME_ACCOUNTING code Fix this warning: lkml.org/lkml/2011/1/30/124 kernel/sched.c:3719: warning: 'irqtime_account_idle_ticks' defined but not used kernel/sched.c:3720: warning: 'irqtime_account_process_tick' defined but not used In a cleaner way than: 7e9498705e81: sched: Add #ifdef around irq time accounting functions This patch will not have any functional impact. 
Signed-off-by: Venkatesh Pallipadi Cc: heiko.carstens@de.ibm.com Cc: a.p.zijlstra@chello.nl LKML-Reference: <1298675596-10992-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 64 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 79bca166bed7..0c8712630f05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3687,7 +3687,36 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING + #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -3749,42 +3778,11 @@ static void irqtime_account_idle_ticks(int ticks) for (i = 0; i < ticks; i++) irqtime_account_process_tick(current, 0, rq); } -#else +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ static void irqtime_account_idle_ticks(int ticks) {} static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} -#endif -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ - -/* - * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - - cpustat->steal = cputime64_add(cpustat->steal, cputime64); -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); - else - cpustat->idle = cputime64_add(cpustat->idle, cputime64); -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* * Account a single tick of cpu time. -- cgit v1.2.1 From 3a142a0672b48a853f00af61f184c7341ac9c99d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Feb 2011 22:34:23 +0100 Subject: clockevents: Prevent oneshot mode when broadcast device is periodic When the per cpu timer is marked CLOCK_EVT_FEAT_C3STOP, then we only can switch into oneshot mode, when the backup broadcast device supports oneshot mode as well. Otherwise we would try to switch the broadcast device into an unsupported mode unconditionally. This went unnoticed so far as the current available broadcast devices support oneshot mode. Seth unearthed this problem while debugging and working around an hpet related BIOS wreckage. Add the necessary check to tick_is_oneshot_available(). 
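For context, a rough sketch of the kind of per-cpu device this check is about; the driver name and callbacks are invented for illustration. Because the device stops in deep C-states (CLOCK_EVT_FEAT_C3STOP), oneshot/highres mode may now only be entered when the broadcast device advertises oneshot support as well:

#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static void foo_timer_set_mode(enum clock_event_mode mode,
                               struct clock_event_device *evt)
{
        /* program the hardware for periodic/oneshot/shutdown */
}

static int foo_timer_set_next_event(unsigned long delta,
                                    struct clock_event_device *evt)
{
        /* arm the next expiry, delta clock cycles from now */
        return 0;
}

static struct clock_event_device foo_local_timer = {
        .name           = "foo-local-timer",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
                          CLOCK_EVT_FEAT_C3STOP,  /* stops in deep idle */
        .rating         = 300,
        .set_mode       = foo_timer_set_mode,
        .set_next_event = foo_timer_set_next_event,
};

static void foo_local_timer_setup(void)
{
        foo_local_timer.cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(&foo_local_timer);
}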
Reported-and-tested-by: Seth Forshee Signed-off-by: Thomas Gleixner LKML-Reference: Cc: stable@kernel.org # .21 -> --- kernel/time/tick-broadcast.c | 10 ++++++++++ kernel/time/tick-common.c | 6 +++++- kernel/time/tick-internal.h | 3 +++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..a3b5aff62606 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -600,4 +600,14 @@ int tick_broadcast_oneshot_active(void) return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; } +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 051bc80a0c43..ed228ef6f6b8 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -51,7 +51,11 @@ int tick_is_oneshot_available(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..f65d3a723a64 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast(int cpu); +bool tick_broadcast_oneshot_available(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -46,6 +47,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline bool tick_broadcast_oneshot_available(void) { return true; } # endif /* !BROADCAST */ #else /* !ONESHOT */ @@ -76,6 +78,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) return 0; } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline bool tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ /* -- cgit v1.2.1 From 8d32a307e4faa8b123dc8a9cd56d1a7525f69ad3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:23 +0000 Subject: genirq: Provide forced interrupt threading Add a commandline parameter "threadirqs" which forces all interrupts except those marked IRQF_NO_THREAD to run threaded. That's mostly a debug option to allow retrieving better debug data from crashing interrupt handlers. If "threadirqs" is not enabled on the kernel command line, then there is no impact in the interrupt hotpath. Architecture code needs to select CONFIG_IRQ_FORCED_THREADING after marking the interrupts which cant be threaded IRQF_NO_THREAD. All interrupts which have IRQF_TIMER set are implict marked IRQF_NO_THREAD. Also all PER_CPU interrupts are excluded. Forced threading hard interrupts also forces all soft interrupt handling into thread context. 
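As an illustration of the driver/architecture side (the irq number, handler and name below are invented), an interrupt that must stay in hard irq context is marked IRQF_NO_THREAD when it is requested; every other handler is pushed into a thread when the kernel is booted with threadirqs:

#include <linux/interrupt.h>

#define FOO_TIMER_IRQ   16              /* hypothetical irq number */

static irqreturn_t foo_timer_interrupt(int irq, void *dev_id)
{
        /* runs in hard irq context even with "threadirqs" */
        return IRQ_HANDLED;
}

static int __init foo_timer_irq_init(void)
{
        /* IRQF_TIMER interrupts are implicitly IRQF_NO_THREAD as well */
        return request_irq(FOO_TIMER_IRQ, foo_timer_interrupt,
                           IRQF_TIMER | IRQF_NO_THREAD, "foo-timer", NULL);
}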
When enabled it might slow down things a bit, but for debugging problems in interrupt code it's a reasonable penalty as it does not immediately crash and burn the machine when an interrupt handler is buggy. Some test results on a Core2Duo machine: Cache cold run of: # time git grep irq_desc non-threaded threaded real 1m18.741s 1m19.061s user 0m1.874s 0m1.757s sys 0m5.843s 0m5.427s # iperf -c server non-threaded [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec threaded [ 3] 0.0-10.0 sec 1.09 GBytes 939 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 937 Mbits/sec Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.772668648@linutronix.de> --- kernel/irq/Kconfig | 3 +++ kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/softirq.c | 16 ++++++++++-- 4 files changed, 82 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 355b8c7957f5..144db9dcfcde 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -38,6 +38,9 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +config IRQ_FORCED_THREADING + bool + config SPARSE_IRQ bool "Support sparse irq numbering" depends on HAVE_SPARSE_IRQ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 935bec4bfa87..6c6ec9a49027 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -27,12 +27,14 @@ extern int noirqdebug; * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity + * IRQTF_FORCED_THREAD - irq action is force threaded */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, IRQTF_WARNED, IRQTF_AFFINITY, + IRQTF_FORCED_THREAD, }; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 58c861367300..acd599a43bfb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,17 @@ #include "internals.h" +#ifdef CONFIG_IRQ_FORCED_THREADING +__read_mostly bool force_irqthreads; + +static int __init setup_forced_irqthreads(char *arg) +{ + force_irqthreads = true; + return 0; +} +early_param("threadirqs", setup_forced_irqthreads); +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -701,6 +712,32 @@ static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif +/* + * Interrupts which are not explicitely requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. + */ +static void +irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + local_bh_disable(); + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + local_bh_enable(); +} + +/* + * Interrupts explicitely requested as threaded interupts want to be + * preemtible - many of them need to sleep and wait for slow busses to + * complete. 
+ */ +static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); +} + /* * Interrupt handler thread */ @@ -711,8 +748,15 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); + void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); int wake; + if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; + else + handler_fn = irq_thread_fn; + sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -736,10 +780,7 @@ static int irq_thread(void *data) raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); - - action->thread_fn(action->irq, action->dev_id); - - irq_finalize_oneshot(desc, action, false); + handler_fn(desc, action); } wake = atomic_dec_and_test(&desc->threads_active); @@ -789,6 +830,22 @@ void exit_irq_thread(void) set_bit(IRQTF_DIED, &tsk->irqaction->flags); } +static void irq_setup_forced_threading(struct irqaction *new) +{ + if (!force_irqthreads) + return; + if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) + return; + + new->flags |= IRQF_ONESHOT; + + if (!new->thread_fn) { + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + } +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -838,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; + } else { + irq_setup_forced_threading(new); } /* diff --git a/kernel/softirq.c b/kernel/softirq.c index c0490464e92f..a33fb2911248 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,9 +311,21 @@ void irq_enter(void) } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -# define invoke_softirq() __do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + __do_softirq(); + else + wakeup_softirqd(); +} #else -# define invoke_softirq() do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + do_softirq(); + else + wakeup_softirqd(); +} #endif /* -- cgit v1.2.1 From c69e3758ff56d03e161187355791ec992c574276 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Mar 2011 11:49:21 +0100 Subject: genirq: Fixup fasteoi handler for oneshot mode The fasteoi handler must mask the interrupt line in oneshot mode otherwise we end up with an irq storm. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b5145654855f..c9c0601f0615 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -513,6 +513,10 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) mask_irq(desc); goto out; } + + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + preflow_handler(desc); handle_irq_event(desc); -- cgit v1.2.1 From 5cd10e7946d28cfc42442fee2e6c757e244d756e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Mar 2011 16:58:30 +0100 Subject: hrtimer: Update base[CLOCK_BOOTTIME].offset correctly We calculate the current time of each clock base by adding an offset to clock_monotonic. 
The offset for the clock bases is set in retrigger_next_event() which is called when we switch a cpu to highres mode or when the clock was set. Add the missing update for clock boottime. Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/hrtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 4c53af18734b..f679a2d08723 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -635,6 +635,8 @@ static void retrigger_next_event(void *arg) raw_spin_lock(&base->lock); base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); -- cgit v1.2.1 From 2d3a8497f8cc5aca14b722cd37d51f6c15ff9f74 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 3 Mar 2011 10:53:20 -0500 Subject: blktrace: Remove blk_fill_rwbs_rq. If we enable trace events to trace block actions, We use blk_fill_rwbs_rq to analyze the corresponding actions in request's cmd_flags, but we only choose the minor 2 bits from it, so most of other flags(e.g, REQ_SYNC) are missing. For example, with a sync write we get: write_test-2409 [001] 160.013869: block_rq_insert: 3,64 W 0 () 258135 + = 8 [write_test] Since now we have integrated the flags of both bio and request, it is safe to pass rq->cmd_flags directly to blk_fill_rwbs and blk_fill_rwbs_rq isn't needed any more. With this patch, after a sync write we get: write_test-2417 [000] 226.603878: block_rq_insert: 3,64 WS 0 () 258135 += 8 [write_test] Signed-off-by: Tao Ma Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d95721f33702..cbafed7d4f38 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1827,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } -void blk_fill_rwbs_rq(char *rwbs, struct request *rq) -{ - int rw = rq->cmd_flags & 0x03; - int bytes; - - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - - bytes = blk_rq_bytes(rq); - - blk_fill_rwbs(rwbs, rw, bytes); -} - #endif /* CONFIG_EVENT_TRACING */ -- cgit v1.2.1 From c53fa1ed92cd671a1dfb1e7569e9ab672612ddc6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Mar 2011 10:55:40 -0800 Subject: netlink: kill loginuid/sessionid/sid members from struct netlink_skb_parms Netlink message processing in the kernel is synchronous these days, the session information can be collected when needed. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- kernel/audit.c | 6 +++--- kernel/auditfilter.c | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 162e88e33bc9..939500317066 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -673,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; - loginuid = NETLINK_CB(skb).loginuid; - sessionid = NETLINK_CB(skb).sessionid; - sid = NETLINK_CB(skb).sid; + loginuid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index add2819af71b..f8277c80d678 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; + u32 sid; switch (f->type) { case AUDIT_PID: @@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, result = audit_comparator(cb->creds.gid, f->op, f->val); break; case AUDIT_LOGINUID: - result = audit_comparator(cb->loginuid, f->op, f->val); + result = audit_comparator(audit_get_loginuid(current), + f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: - if (f->lsm_rule) - result = security_audit_rule_match(cb->sid, + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule, NULL); + } break; } -- cgit v1.2.1 From 0c3b9168017cbad2c4af3dd65ec93fe646eeaa62 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 3 Mar 2011 17:04:35 +0530 Subject: sched: Fix sched rt group scheduling when hierachy is enabled The current sched rt code is broken when it comes to hierarchical scheduling, this patch fixes two problems 1. It adds redundant enqueuing (harmless) when it finds a queue has tasks enqueued, but it has no run time and it is not throttled. 2. The most important change is in sched_rt_rq_enqueue/dequeue. The code just picks the rt_rq belonging to the current cpu on which the period timer runs, the patch fixes it, so that the correct rt_se is enqueued/dequeued. Tested with a simple hierarchy /c/d, c and d assigned similar runtimes of 50,000 and a while 1 loop runs within "d". Both c and d get throttled, without the patch, the task just stops running and never runs (depends on where the sched_rt b/w timer runs). With the patch, the task is throttled and runs as expected. 
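A minimal user-space sketch of the second problem, using made-up names rather than the scheduler's real data structures: the period timer can fire on one CPU while servicing another CPU's rt_rq, so the entity has to be looked up by the CPU that owns the rt_rq, not by the CPU the code happens to be running on.

#include <stdio.h>

#define NR_CPUS 4

struct rt_rq_like { int cpu; };

static int entity_of[NR_CPUS];	/* per-group, per-CPU entities */

static int pick_entity(struct rt_rq_like *rt_rq, int this_cpu, int buggy)
{
	return buggy ? entity_of[this_cpu]	/* CPU the timer runs on */
		     : entity_of[rt_rq->cpu];	/* CPU owning the rt_rq  */
}

int main(void)
{
	struct rt_rq_like rq_of_cpu2 = { .cpu = 2 };
	int i;

	for (i = 0; i < NR_CPUS; i++)
		entity_of[i] = 100 + i;

	/* Timer fires on CPU 0 but services CPU 2's runqueue. */
	printf("buggy lookup: %d\n", pick_entity(&rq_of_cpu2, 0, 1)); /* 100 */
	printf("fixed lookup: %d\n", pick_entity(&rq_of_cpu2, 0, 0)); /* 102 */
	return 0;
}
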
[ bharata, suggestions on how to pick the rt_se belong to the rt_rq and correct cpu ] Signed-off-by: Balbir Singh Acked-by: Bharata B Rao Signed-off-by: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: <20110303113435.GA2868@balbir.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ad6267714c84..01f75a5f17af 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se; - rt_se = rt_rq->tg->rt_se[this_cpu]; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) @@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct sched_rt_entity *rt_se; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); - rt_se = rt_rq->tg->rt_se[this_cpu]; + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_se && on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); @@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); - } else if (rt_rq->rt_nr_running) + } else if (rt_rq->rt_nr_running) { idle = 0; + if (!rt_rq_throttled(rt_rq)) + enqueue = 1; + } if (enqueue) sched_rt_rq_enqueue(rt_rq); -- cgit v1.2.1 From a2f5c9ab79f78e8b91ac993e0543d65b661dd19b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 22 Feb 2011 13:04:33 -0800 Subject: sched: Allow SCHED_BATCH to preempt SCHED_IDLE tasks Perform the test for SCHED_IDLE before testing for SCHED_BATCH (and ensure idle tasks don't preempt idle tasks) so the non-interactive, but still important, SCHED_BATCH tasks will run in favor of the very low priority SCHED_IDLE tasks. Signed-off-by: Darren Hart Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Richard Purdie LKML-Reference: <1298408674-3130-2-git-send-email-dvhart@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3a88dee165c0..1438e13cf8be 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1870,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (test_tsk_need_resched(curr)) return; + /* Idle tasks are by definition preempted by non-idle tasks. */ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + /* - * Batch and idle tasks do not preempt (their preemption is driven by - * the tick): + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; - /* Idle tasks are by definition preempted by everybody. 
*/ - if (unlikely(curr->policy == SCHED_IDLE)) - goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; -- cgit v1.2.1 From c02aa73b1d18e43cfd79c2f193b225e84ca497c8 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 17 Feb 2011 15:37:07 -0800 Subject: sched: Allow users with sufficient RLIMIT_NICE to change from SCHED_IDLE policy The current scheduler implementation returns -EPERM when trying to change from SCHED_IDLE to SCHED_OTHER or SCHED_BATCH. Since SCHED_IDLE is considered to be a nice 20 on steroids, changing to another policy should be allowed provided the RLIMIT_NICE is accounted for. This patch allows the following test-case to pass with RLIMIT_NICE=40, but still fail with RLIMIT_NICE=10 when the calling process is run from a typical shell (nice 0, or 20 in rlimit terms). int main() { int ret; struct sched_param sp; sp.sched_priority = 0; /* switch to SCHED_IDLE */ ret = sched_setscheduler(0, SCHED_IDLE, &sp); printf("setscheduler IDLE: %d\n", ret); if (ret) return ret; /* switch back to SCHED_OTHER */ ret = sched_setscheduler(0, SCHED_OTHER, &sp); printf("setscheduler OTHER: %d\n", ret); return ret; } $ ulimit -e 40 $ ./test setscheduler IDLE: 0 setscheduler OTHER: 0 $ ulimit -e 10 $ ulimit -e 10 $ ./test setscheduler IDLE: 0 setscheduler OTHER: -1 Signed-off-by: Darren Hart Signed-off-by: Peter Zijlstra Cc: Richard Purdie LKML-Reference: <4D657BEE.4040608@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0c8712630f05..f3030709d826 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4981,12 +4981,15 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } /* can't change other user's priorities */ if (!check_same_owner(p)) -- cgit v1.2.1 From 6d1cafd8b56ea726c10a5a104de57cc3ed8fa953 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 1 Mar 2011 16:28:21 -0800 Subject: sched: Resched proper CPU on yield_to() yield_to_task_fair() has code to resched the CPU of yielding task when the intention is to resched the CPU of the task that is being yielded to. Change here fixes the problem and also makes the resched conditional on rq != p_rq. Signed-off-by: Venkatesh Pallipadi Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <1299025701-22168-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- kernel/sched_fair.c | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f3030709d826..61452e86c73b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5522,8 +5522,15 @@ again: goto out; yielded = curr->sched_class->yield_to_task(rq, p, preempt); - if (yielded) + if (yielded) { schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. 
+ */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } out: double_rq_unlock(rq, p_rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1438e13cf8be..3f7ec9e27ee1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1987,10 +1987,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp /* Tell the scheduler that we'd really like pse to run next. */ set_next_buddy(se); - /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ - if (preempt) - resched_task(rq->curr); - yield_task_fair(rq); return true; -- cgit v1.2.1 From 940c5b2971de443df22eed0441bc74fb0116e9f5 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 27 Feb 2011 21:13:31 +0800 Subject: perf: Fix the missing event initialization when pmu is found in idr Currently, the event is not initialized if pmu is found in idr. This never causes bug just because now no pmu is associated with the idr id. Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1298812411.2699.9.camel@localhost> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 64a018e94fca..821ce8221974 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6098,17 +6098,22 @@ struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; + int ret; idx = srcu_read_lock(&pmus_srcu); rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); - if (pmu) + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); goto unlock; + } list_for_each_entry_rcu(pmu, &pmus, entry) { - int ret = pmu->event_init(event); + ret = pmu->event_init(event); if (!ret) goto unlock; -- cgit v1.2.1 From 3db272c0494900fcb905a201180a78cae3addd6e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:37 +0800 Subject: perf cgroup: Fix leak of file reference count In perf_cgroup_connect(), fput_light() is missing in a failure path. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F3461.6060406@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 821ce8221974..7c999e809ba3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -404,8 +404,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, return -EBADF; css = cgroup_css_from_dir(file, perf_subsys_id); - if (IS_ERR(css)) - return PTR_ERR(css); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -422,6 +424,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, /* must be done before we fput() the file */ perf_get_cgroup(event); } +out: fput_light(file, fput_needed); return ret; } -- cgit v1.2.1 From f75e18cb9627b1d3d752b83a0b5563da0042c50a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:50 +0800 Subject: perf cgroup: Fix unmatched call to perf_detach_cgroup() In the failure path, we call perf_detach_cgroup(), but we didn't call perf_get_cgroup() prio to it. 
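The invariant being restored is the usual one for reference-counted attach paths: take the reference before any code that can bail out and undo the attach, so every release has a matching get. A stand-alone sketch of that pattern (hypothetical names, not the perf code; the failing check stands in for the group-leader test):

#include <assert.h>

struct obj { int refcount; };

static void obj_get(struct obj *o) { o->refcount++; }
static void obj_put(struct obj *o) { o->refcount--; }

/* Attach o to some context; fail_check models a validation that may fail. */
static int attach(struct obj *o, int fail_check)
{
	obj_get(o);		/* taken before any bail-out path ...   */
	if (fail_check) {
		obj_put(o);	/* ... so the error path stays balanced */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct obj o = { .refcount = 1 };

	(void)attach(&o, 1);	/* a failing attach must not leak or underflow */
	assert(o.refcount == 1);
	return 0;
}
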
Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F346E.9070606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7c999e809ba3..b00209544d57 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -412,6 +412,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; + /* must be done before we fput() the file */ + perf_get_cgroup(event); + /* * all events in a group must monitor * the same cgroup because a task belongs @@ -420,9 +423,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; - } else { - /* must be done before we fput() the file */ - perf_get_cgroup(event); } out: fput_light(file, fput_needed); -- cgit v1.2.1 From 1b15d0558e82df9b3659804ceb44187b98eda354 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:26:06 +0800 Subject: perf cgroup: Clean up perf_cgroup_create() - Use kzalloc() to replace kmalloc() + memset(). - Remove redundant initialization, since alloc_percpu() returns zero-filled percpu memory. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F347E.2010806@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b00209544d57..193b1900e64f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -7346,26 +7346,17 @@ static struct cgroup_subsys_state *perf_cgroup_create( struct cgroup_subsys *ss, struct cgroup *cont) { struct perf_cgroup *jc; - struct perf_cgroup_info *t; - int c; - jc = kmalloc(sizeof(*jc), GFP_KERNEL); + jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); - memset(jc, 0, sizeof(*jc)); - jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } - for_each_possible_cpu(c) { - t = per_cpu_ptr(jc->info, c); - t->time = 0; - t->timestamp = 0; - } return &jc->css; } -- cgit v1.2.1 From 08309379b7083a9ceec0f9bb96a629058fb623c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Mar 2011 11:31:20 +0100 Subject: perf: Fix cgroup vs jump_label problem Li Zefan reported that the jump label code sleeps and we're calling it under a spinlock, *fail* ;-) Reported-by: Li Zefan Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 193b1900e64f..ed253aa24ba4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -820,16 +820,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups++; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); - } list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) @@ -957,11 +949,8 @@ 
list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups--; - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); - } ctx->nr_events--; if (event->attr.inherit_stat) @@ -2903,6 +2892,10 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } } if (event->buffer) { @@ -6478,6 +6471,13 @@ SYSCALL_DEFINE5(perf_event_open, err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); } /* -- cgit v1.2.1 From ba74f4d7e5125d04d453b4af69c53c533e6feb80 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 9 Jan 2011 18:09:51 -0800 Subject: rcu: call __rcu_read_unlock() in exit_rcu for tiny RCU Using __rcu_read_lock() in place of rcu_read_lock() leaves any debug state as it really should be, namely with the lock still held. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 015abaea962a..3cb8e362e883 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -852,7 +852,7 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ -- cgit v1.2.1 From 37743de384951ef97c040e69039820c987849501 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 10 Jan 2011 21:43:13 +0100 Subject: rcutorture: Get rid of duplicate sched.h include linux/sched.h is included twice in kernel/rcutorture.c - once is enough. Signed-off-by: Jesper Juhl Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 89613f97ff26..c224da41890c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,7 +47,6 @@ #include #include #include -#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and " -- cgit v1.2.1 From fe8e64071a4ff5925ced4f33cb32ba090a04649c Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Sun, 30 Jan 2011 18:23:08 +0800 Subject: rcupdate: remove dead code DEBUG_OBJECTS_RCU_HEAD depends on PREEMPT, so #ifndef CONFIG_PREEMPT is totally useless in kernel/rcupdate.c. Signed-off-by: WANG Cong Cc: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a23a57a976d1..afd21d17c081 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -215,10 +215,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. 
*/ -#ifndef CONFIG_PREEMPT - WARN_ON(1); - return 0; -#else if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { WARN_ON(1); @@ -229,7 +225,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) rcu_barrier_bh(); debug_object_free(head, &rcuhead_debug_descr); return 1; -#endif default: return 0; } -- cgit v1.2.1 From e611eecd6f9f91d74beda8cfbb3b5e02abdeb5a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Jan 2011 23:56:46 -0800 Subject: rcu: add comment saying why DEBUG_OBJECTS_RCU_HEAD depends on PREEMPT. The build will break if you change the Kconfig to allow DEBUG_OBJECTS_RCU_HEAD and !PREEMPT, so document the reasoning near where the breakage would occur. Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index afd21d17c081..f3240e987928 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -214,6 +214,11 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * Note that the machinery to reliably determine whether + * or not we are in an RCU read-side critical section + * exists only in the preemptible RCU implementations + * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why + * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. */ if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { -- cgit v1.2.1 From e3e89cc535223433a619d0969db3fa05cdd946b8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 4 Mar 2011 09:23:30 -0800 Subject: Mark ptrace_{traceme,attach,detach} static They are only used inside kernel/ptrace.c, and have been for a long time. We don't want to go back to the bad-old-days when architectures did things on their own, so make them static and private. Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1708b1e2972d..e2302e40b360 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } -int ptrace_attach(struct task_struct *task) +static int ptrace_attach(struct task_struct *task) { int retval; @@ -219,7 +219,7 @@ out: * Performs checks and sets PT_PTRACED. * Should be used by all ptrace implementations for PTRACE_TRACEME. */ -int ptrace_traceme(void) +static int ptrace_traceme(void) { int ret = -EPERM; @@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) return false; } -int ptrace_detach(struct task_struct *child, unsigned int data) +static int ptrace_detach(struct task_struct *child, unsigned int data) { bool dead = false; -- cgit v1.2.1 From b75f38d659e6fc747eda64cb72f3920e29dd44a4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 4 Mar 2011 17:36:21 -0800 Subject: cpuset: add a missing unlock in cpuset_write_resmask() Don't forget to release cgroup_mutex if alloc_trial_cpuset() fails. 
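The shape of the fix is the common single-exit pattern: once the lock is taken, every failure path funnels through one label that drops it. A small stand-alone sketch, with a pthread mutex standing in for cgroup_mutex and malloc() for alloc_trial_cpuset():

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int write_resmask_like(size_t n)
{
	void *trial;
	int retval = 0;

	pthread_mutex_lock(&lock);

	trial = malloc(n);		/* stands in for alloc_trial_cpuset() */
	if (!trial) {
		retval = -ENOMEM;
		goto out;		/* the lock is still released below */
	}

	/* ... update state while holding the lock ... */

	free(trial);
out:
	pthread_mutex_unlock(&lock);
	return retval;
}

int main(void)
{
	return write_resmask_like(16) ? 1 : 0;
}
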
[akpm@linux-foundation.org: avoid multiple return points] Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4349935c2ad8..e92e98189032 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; trialcs = alloc_trial_cpuset(cs); - if (!trialcs) - return -ENOMEM; + if (!trialcs) { + retval = -ENOMEM; + goto out; + } switch (cft->private) { case FILE_CPULIST: @@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); +out: cgroup_unlock(); return retval; } -- cgit v1.2.1 From 4ba8216cd90560bc402f52076f64d8546e8aefcb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Jan 2011 22:52:22 +0100 Subject: BKL: That's all, folks This removes the implementation of the big kernel lock, at last. A lot of people have worked on this in the past, I so the credit for this patch should be with everyone who participated in the hunt. The names on the Cc list are the people that were the most active in this, according to the recorded git history, in alphabetical order. Signed-off-by: Arnd Bergmann Acked-by: Alan Cox Cc: Alessio Igor Bogani Cc: Al Viro Cc: Andrew Hendry Cc: Andrew Morton Cc: Christoph Hellwig Cc: Eric W. Biederman Cc: Frederic Weisbecker Cc: Hans Verkuil Acked-by: Ingo Molnar Cc: Jan Blunck Cc: John Kacur Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Oliver Neukum Cc: Paul Menage Acked-by: Thomas Gleixner Cc: Trond Myklebust --- kernel/sched.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..827c170c6017 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -3945,9 +3944,6 @@ need_resched: rcu_note_context_switch(cpu); prev = rq->curr; - release_kernel_lock(prev); -need_resched_nonpreemptible: - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -4010,9 +4006,6 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(prev))) - goto need_resched_nonpreemptible; - preempt_enable_no_resched(); if (need_resched()) goto need_resched; @@ -8074,7 +8067,7 @@ static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); - return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); + return (nested == preempt_offset); } void __might_sleep(const char *file, int line, int preempt_offset) -- cgit v1.2.1 From dfef6dcd35cb4a251f6322ca9b2c06f0bb1aa1f4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Mar 2011 01:25:28 -0500 Subject: unfuck proc_sysctl ->d_compare() a) struct inode is not going to be freed under ->d_compare(); however, the thing PROC_I(inode)->sysctl points to just might. Fortunately, it's enough to make freeing that sucker delayed, provided that we don't step on its ->unregistering, clear the pointer to it in PROC_I(inode) before dropping the reference and check if it's NULL in ->d_compare(). 
b) I'm not sure that we *can* walk into NULL inode here (we recheck dentry->seq between verifying that it's still hashed / fetching dentry->d_inode and passing it to ->d_compare() and there's no negative hashed dentries in /proc/sys/*), but if we can walk into that, we really should not have ->d_compare() return 0 on it! Said that, I really suspect that this check can be simply killed. Nick? Signed-off-by: Al Viro --- kernel/sysctl.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83db985..4eed0af5d144 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -194,9 +194,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { - .count = 1, + {{.count = 1, .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; @@ -1567,11 +1567,16 @@ void sysctl_head_get(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } +static void free_head(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ctl_table_header, rcu)); +} + void sysctl_head_put(struct ctl_table_header *head) { spin_lock(&sysctl_lock); if (!--head->count) - kfree(head); + call_rcu(&head->rcu, free_head); spin_unlock(&sysctl_lock); } @@ -1948,10 +1953,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) start_unregistering(header); if (!--header->parent->count) { WARN_ON(1); - kfree(header->parent); + call_rcu(&header->parent->rcu, free_head); } if (!--header->count) - kfree(header); + call_rcu(&header->rcu, free_head); spin_unlock(&sysctl_lock); } -- cgit v1.2.1 From 997772884036e6e121de39322179989154437d9f Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 7 Mar 2011 09:58:33 +0100 Subject: debugobjects: Add hint for better object identification In complex subsystems like mac80211 structures can contain several timers and work structs, so identifying a specific instance from the call trace and object type output of debugobjects can be hard. Allow the subsystems which support debugobjects to provide a hint function. This function returns a pointer to a kernel address (preferrably the objects callback function) which is printed along with the debugobjects type. Add hint methods for timer_list, work_struct and hrtimer. 
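A cut-down sketch of how such a hint is meant to be consumed; the structures and the report format below are illustrative stand-ins, not the actual lib/debugobjects.c code:

#include <stdio.h>

struct work_struct_like { void (*func)(void *data); };

struct descr_like {
	const char *name;
	void *(*debug_hint)(void *addr);
};

static void my_work_handler(void *data) { (void)data; }

static void *work_debug_hint(void *addr)
{
	return (void *)((struct work_struct_like *)addr)->func;
}

static const struct descr_like work_descr = {
	.name       = "work_struct",
	.debug_hint = work_debug_hint,
};

int main(void)
{
	struct work_struct_like w = { .func = my_work_handler };
	void *hint = work_descr.debug_hint ? work_descr.debug_hint(&w) : NULL;

	/* The kernel would resolve this address to a symbol name (%pS). */
	printf("ODEBUG: object type: %s, hint: %p\n", work_descr.name, hint);
	return 0;
}
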
[ tglx: Massaged changelog, made it compile ] Signed-off-by: Stanislaw Gruszka LKML-Reference: <20110307085809.GA9334@redhat.com> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 6 ++++++ kernel/timer.c | 6 ++++++ kernel/workqueue.c | 6 ++++++ 3 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..e38f5a073d01 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -334,6 +334,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); static struct debug_obj_descr hrtimer_debug_descr; +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -393,6 +398,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..33a67925d900 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} static struct debug_obj_descr timer_debug_descr; +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", + .debug_hint = timer_debug_hint, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee6578b578ad..b5fe4c00eb3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -316,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, static struct debug_obj_descr work_debug_descr; +static void *work_debug_hint(void *addr) +{ + return ((struct work_struct *) addr)->func; +} + /* * fixup_init is called when: * - an active object is initialized @@ -387,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", + .debug_hint = work_debug_hint, .fixup_init = work_fixup_init, .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, -- cgit v1.2.1 From c68fd4f3ca90de7d18c567e70b2c164078aefadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Mar 2011 19:52:55 +0100 Subject: genirq: Add comments to Kconfig switches Signed-off-by: Thomas Gleixner Cc: Sam Ravnborg --- kernel/irq/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 144db9dcfcde..09bef82d74cb 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# Select this to activate the generic irq options below config HAVE_GENERIC_HARDIRQS bool @@ -17,27 +18,36 @@ config GENERIC_HARDIRQS_NO_COMPAT bool # Options selectable by the architecture code + +# Make sparse irq Kconfig switch below available config HAVE_SPARSE_IRQ bool +# Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool +# Use the generic /proc/interrupts implementation config GENERIC_IRQ_SHOW bool +# Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool +# Alpha specific irq affinity mechanism 
config AUTO_IRQ_AFFINITY bool +# Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND bool +# Preflow handler support for fasteoi (sparc64) config IRQ_PREFLOW_FASTEOI bool +# Support forced irq threading config IRQ_FORCED_THREADING bool -- cgit v1.2.1 From 750912fa366312e9c5bc83eab352898a26750401 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Wed, 8 Dec 2010 13:46:47 -0800 Subject: tracing: Add an 'overwrite' trace_option. Add an "overwrite" trace_option for ftrace to control whether the buffer should be overwritten on overflow or not. The default remains to overwrite old events when the buffer is full. This patch adds the option to instead discard newest events when the buffer is full. This is useful to get a snapshot of traces just after enabling traces. Dropping the current event is also a simpler code path. Signed-off-by: David Sharp LKML-Reference: <1291844807-15481-1-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.c | 17 +++++++++++------ kernel/trace/trace.h | 1 + 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..269db80a961e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1429,6 +1429,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8dc8da6733f9..85e3ee1e474e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,8 +41,6 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -425,6 +423,7 @@ static const char *trace_options[] = { "sleep-time", "graph-time", "record-cmd", + "overwrite", NULL }; @@ -2529,6 +2528,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); } static ssize_t @@ -4555,9 +4557,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) __init static int tracer_alloc_buffers(void) { int ring_buf_size; + enum ring_buffer_flags rb_flags; int i; int ret = -ENOMEM; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -4570,12 +4574,13 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); @@ -4585,7 +4590,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, rb_flags); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 856e73cc1d3f..951d0b7e7062 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -606,6 +606,7 @@ enum trace_iterator_flags { TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, }; /* -- cgit v1.2.1 From de29be5e712dc8b7eef2bef9417af3bb6a88e47a Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:16 -0800 Subject: ring-buffer: Remove unused #include Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-3-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 269db80a961e..3237d961d905 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5,7 +5,6 @@ */ #include #include -#include #include #include #include -- cgit v1.2.1 From e6e1e2593592a8f6f6380496655d8c6f67431266 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 10:41:56 -0500 Subject: tracing: Remove lock_depth from event entry The lock_depth field in the event headers was added as a temporary data point for help in removing the BKL. Now that the BKL is pretty much been removed, we can remove this field. This in turn changes the header from 12 bytes to 8 bytes, removing the 4 byte buffer that gcc would insert if the first field in the data load was 8 bytes in size. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace_events.c | 1 - kernel/trace/trace_output.c | 10 ++-------- 3 files changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85e3ee1e474e..fd6e1b906b3c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1101,7 +1101,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? 
TRACE_FLAG_IRQS_OFF : 0) | @@ -1748,10 +1747,9 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| /_--=> lock-depth \n"); - seq_puts(m, "# |||||/ delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a4..e1d579b19834 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, lock_depth); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..151f32e5f2c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -529,7 +529,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) * @entry: The trace entry field from the ring buffer * * Prints the generic fields of irqs off, in hard or softirq, preempt - * count and lock depth. + * count. */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { @@ -554,13 +554,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); - if (!ret) - return 0; - - if (entry->lock_depth < 0) - return trace_seq_putc(s, '.'); - - return trace_seq_printf(s, "%d", entry->lock_depth); + return ret; } static int -- cgit v1.2.1 From 140e4f2d1cd816aed196705c036763313c0e4bd3 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:19 -0800 Subject: tracing: Fix event alignment: ftrace:context_switch and ftrace:wakeup Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-6-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be8..1516cb3ec549 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, */ #define FTRACE_CTX_FIELDS \ __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ __field( unsigned char, prev_prio ) \ __field( unsigned char, prev_state ) \ - __field( unsigned int, next_pid ) \ __field( unsigned char, next_prio ) \ - __field( unsigned char, next_state ) \ - __field( unsigned int, next_cpu ) + __field( unsigned char, next_state ) FTRACE_ENTRY(context_switch, ctx_switch_entry, -- cgit v1.2.1 From 10da37a645b5e915d8572cc2b1f5eb11ada3ea4f Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:26 -0800 Subject: tracing: Adjust conditional expression latency formatting. Formatting change only to improve code readability. No code changes except to introduce intermediate variables. 
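The idiom is just to give the pieces of a nested conditional names before they are used, for example (a stand-alone illustration with made-up flag values):

#include <stdio.h>

#define F_IRQS_OFF	0x01	/* made-up flag values, not the kernel's */
#define F_HARDIRQ	0x02
#define F_SOFTIRQ	0x04

int main(void)
{
	unsigned int flags = F_HARDIRQ | F_IRQS_OFF;
	int hardirq = flags & F_HARDIRQ;
	int softirq = flags & F_SOFTIRQ;

	/* Named intermediates instead of one nested ?: inside the printf(). */
	char irqs_off     = (flags & F_IRQS_OFF) ? 'd' : '.';
	char hardsoft_irq = (hardirq && softirq) ? 'H' :
			    hardirq             ? 'h' :
			    softirq             ? 's' : '.';

	printf("%c%c\n", irqs_off, hardsoft_irq);
	return 0;
}
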
Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-13-git-send-email-dhsharp@google.com> [ Keep variable declarations and assignment separate ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 151f32e5f2c6..456be9063c2d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -533,20 +533,30 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - int hardirq, softirq; + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + if (!trace_seq_printf(s, "%c%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? - 'X' : '.', - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? - 'N' : '.', - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : softirq ? 's' : '.')) + irqs_off, need_resched, hardsoft_irq)) return 0; if (entry->preempt_count) -- cgit v1.2.1 From 31274d72f01604f4b02d933b4f3cac84d2c201fd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Feb 2011 15:52:19 +0100 Subject: tracing: Explain about unstable clock on resume with ring buffer warning The "Delta way too big" warning might appear on a system with a unstable shed clock right after the system is resumed and tracing was enabled at time of suspend. Since it's not realy a bug, and the unstable sched clock is working fast and reliable otherwise, Steven suggested to keep using the sched clock in any case and just to make note in the warning itself. v2 changes: - added #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Signed-off-by: Jiri Olsa LKML-Reference: <20110218145219.GD2604@jolsa.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3237d961d905..db7b439d23ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2172,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable; +#endif WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? 
"" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } -- cgit v1.2.1 From 56355b83e2a24ce7e1870c8479205e2cdd332225 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Mon, 8 Nov 2010 14:05:12 +0800 Subject: tracing: Export trace_set_clr_event() Trace events belonging to a module only exists when the module is loaded. Well, we can use trace_set_clr_event funtion to enable some trace event at the module init routine, so that we will not miss something while loading then module. So, Export the trace_set_clr_event function so that module can use it. Signed-off-by: Yuanhan Liu LKML-Reference: <1289196312-25323-1-git-send-email-yuanhan.liu@linux.intel.com> Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e1d579b19834..e88f74fe1d4c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -325,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) { return __ftrace_set_clr_event(NULL, system, event, set); } +EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.1 From 9a24470b2826e4665b1484836c7ae6aba1ddea32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 14:53:38 -0500 Subject: tracing: Align 4 byte ints together in struct tracer Move elements in struct tracer for better alignment. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 951d0b7e7062..5e9dfc6286dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,8 +272,8 @@ struct tracer { /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; - int print_max; struct tracer_flags *flags; + int print_max; int use_max_tr; }; -- cgit v1.2.1 From 4a0b1665db09cf2da9ad7d0f12da386373c10bfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 20:09:26 -0500 Subject: tracing: Fix irqoff selftest expanding max buffer If the kernel command line declares a tracer "ftrace=sometracer" and that tracer is either not defined or is enabled after irqsoff, then the irqs off selftest will fail with the following error: Testing tracer irqsoff: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/autotest/nobackup/linux-test.git/kernel/trace/tra ce.c:713 update_max_tr_single+0xfa/0x11b() Hardware name: Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.38-rc8-test #1 Call Trace: [] ? warn_slowpath_common+0x65/0x7a [] ? update_max_tr_single+0xfa/0x11b [] ? warn_slowpath_null+0xf/0x13 [] ? update_max_tr_single+0xfa/0x11b [] ? stop_critical_timing+0x154/0x204 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? time_hardirqs_on+0x25/0x28 [] ? trace_hardirqs_on_caller+0x18/0x12f [] ? trace_hardirqs_on+0xb/0xd [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? register_tracer+0xf8/0x1a3 [] ? init_irqsoff_tracer+0xd/0x11 [] ? do_one_initcall+0x71/0x121 [] ? init_irqsoff_tracer+0x0/0x11 [] ? kernel_init+0x13a/0x1b6 [] ? kernel_init+0x0/0x1b6 [] ? 
kernel_thread_helper+0x6/0x10 ---[ end trace e93713a9d40cd06c ]--- .. no entries found ..FAILED! What happens is the "ftrace=..." will expand the ring buffer to its default size (from its minimum size) but it will not expand the max ring buffer (the ring buffer to store maximum latencies). When the irqsoff test runs, it will call the ring buffer swap routine that checks if the max ring buffer is the same size as the normal ring buffer, and will fail if it is not. This causes the test to fail. The solution is to expand the max ring buffer before running the self test if the max ring buffer is used by that tracer and the normal ring buffer is expanded. The max ring buffer should be shrunk again after the test is done to save space. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd6e1b906b3c..9541c27c1cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -779,6 +779,11 @@ __acquires(kernel_lock) tracing_reset_online_cpus(tr); current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); @@ -791,6 +796,10 @@ __acquires(kernel_lock) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + printk(KERN_CONT "PASSED\n"); } #endif -- cgit v1.2.1 From a9e7acfff0a279792918b7b0de74106e576e9988 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Mar 2011 19:12:24 +0100 Subject: hrtimer: Remove empty hrtimer_init_hres_timer() Leftover from earlier implementation. All empty, remove it. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f679a2d08723..63fb74972b75 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -680,14 +680,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -/* - * Initialize the high resolution related parts of a hrtimer - */ -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) -{ -} - - /* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry @@ -759,7 +751,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1141,7 +1132,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; - hrtimer_init_timer_hres(timer); timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS -- cgit v1.2.1 From 8fe8f545c6d753ead15e1f4919d39e8f9bb49629 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Sun, 6 Mar 2011 18:07:50 -0800 Subject: futex: Update futex_wait_setup comments about locking Reviving a cleanup I had done about a year ago as part of a larger futex_set_wait proposal. 
Over the years, the locking of the hashed futex queue got improved, so that some of the "rare but normal" race conditions described in comments can't actually happen anymore. Signed-off-by: Michel Lespinasse Cc: Linus Torvalds Cc: Darren Hart Cc: Peter Zijlstra LKML-Reference: <20110307020750.GA31188@google.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..3184d3b9cadf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1781,13 +1781,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); -- cgit v1.2.1 From bf294b41cefcb22fc3139e0f42c5b3f06728bd5e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Feb 2011 11:05:41 -0800 Subject: SUNRPC: Close a race in __rpc_wait_for_completion_task() Although they run as rpciod background tasks, under normal operation (i.e. no SIGKILL), functions like nfs_sillyrename(), nfs4_proc_unlck() and nfs4_do_close() want to be fully synchronous. This means that when we exit, we want all references to the rpc_task to be gone, and we want any dentry references etc. held by that task to be released. For this reason these functions call __rpc_wait_for_completion_task(), followed by rpc_put_task() in the expectation that the latter will be releasing the last reference to the rpc_task, and thus ensuring that the callback_ops->rpc_release() has been called synchronously. This patch fixes a race which exists due to the fact that rpciod calls rpc_complete_task() (in order to wake up the callers of __rpc_wait_for_completion_task()) and then subsequently calls rpc_put_task() without ensuring that these two steps are done atomically. In order to avoid adding new spin locks, the patch uses the existing waitqueue spin lock to order the rpc_task reference count releases between the waiting process and rpciod. The common case where nobody is waiting for completion is optimised for by checking if the RPC_TASK_ASYNC flag is cleared and/or if the rpc_task reference count is 1: in those cases we drop trying to grab the spin lock, and immediately free up the rpc_task. Those few processes that need to put the rpc_task from inside an asynchronous context and that do not care about ordering are given a new helper: rpc_put_task_async(). 
Signed-off-by: Trond Myklebust --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..42eab5a8437d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4213,6 +4213,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { __wake_up_common(q, mode, 1, 0, key); } +EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. -- cgit v1.2.1 From c0c9ed15042ceac7c485813012a0a97316101b57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 11:51:22 +0100 Subject: futex: Avoid redudant evaluation of task_pid_vnr() The result is not going to change under us, so no need to reevaluate this over and over. Seems to be a leftover from the mechanical mass conversion of task->pid to task_pid_vnr(tsk). Signed-off-by: Thomas Gleixner --- kernel/futex.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3184d3b9cadf..773815465bac 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -674,7 +674,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval; + u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: ret = lock_taken = 0; @@ -684,7 +684,7 @@ retry: * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = task_pid_vnr(task); + newval = vpid; if (set_waiters) newval |= FUTEX_WAITERS; @@ -696,7 +696,7 @@ retry: /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* @@ -723,7 +723,7 @@ retry: */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + newval = (curval & ~FUTEX_TID_MASK) | vpid; ownerdied = 0; lock_taken = 1; } @@ -2047,9 +2047,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + u32 uval, vpid = task_pid_vnr(current); int ret; retry: @@ -2058,7 +2058,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) + if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); @@ -2074,7 +2074,7 @@ retry: * anyone else up: */ if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); + uval = cmpxchg_futex_value_locked(uaddr, vpid, 0); if (unlikely(uval == -EFAULT)) @@ -2083,7 +2083,7 @@ retry: * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == task_pid_vnr(current))) + if (unlikely(uval == vpid)) goto out_unlock; /* -- cgit v1.2.1 From 37a9d912b24f96a0591773e6e6c3642991ae5a70 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Mar 2011 18:48:51 -0800 Subject: futex: Sanitize cmpxchg_futex_value_locked API The cmpxchg_futex_value_locked API was funny in that it returned either the original, user-exposed futex value OR an error code such as -EFAULT. 
This was confusing at best, and could be a source of livelocks in places that retry the cmpxchg_futex_value_locked after trying to fix the issue by running fault_in_user_writeable(). This change makes the cmpxchg_futex_value_locked API more similar to the get_futex_value_locked one, returning an error code and updating the original value through a reference argument. Signed-off-by: Michel Lespinasse Acked-by: Chris Metcalf [tile] Acked-by: Tony Luck [ia64] Acked-by: Thomas Gleixner Tested-by: Michal Simek [microblaze] Acked-by: David Howells [frv] Cc: Darren Hart Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Linus Torvalds LKML-Reference: <20110311024851.GC26122@google.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 773815465bac..237f14bfc022 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, return NULL; } -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) { - u32 curval; + int ret; pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); - return curval; + return ret; } static int get_futex_value_locked(u32 *dest, u32 __user *from) @@ -688,9 +689,7 @@ retry: if (set_waiters) newval |= FUTEX_WAITERS; - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) return -EFAULT; /* @@ -728,9 +727,7 @@ retry: lock_taken = 1; } - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; if (unlikely(curval != uval)) goto retry; @@ -843,9 +840,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) ret = -EINVAL; @@ -880,10 +875,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; + if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) + return -EFAULT; if (oldval != uval) return -EAGAIN; @@ -1578,9 +1571,7 @@ retry: while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) goto handle_fault; if (curval == uval) break; @@ -2073,11 +2064,8 @@ retry: * again. 
If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, vpid, 0); - - - if (unlikely(uval == -EFAULT)) + if (!(uval & FUTEX_OWNER_DIED) && + cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* * Rare case: we managed to release the lock atomically, @@ -2464,9 +2452,7 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) + if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval)) return -1; if (nval != uval) @@ -2679,8 +2665,7 @@ static int __init futex_init(void) * implementation, the non-functional ones will return * -ENOSYS. */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { -- cgit v1.2.1 From 2e12978a9f7a7abd54e8eb9ce70a7718767b8b2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 22 Dec 2010 14:18:50 +0800 Subject: futex,plist: Pass the real head of the priority list to plist_del() Some plist_del()s in kernel/futex.c are passed a faked head of the priority list. It does not fail because the current code does not require the real head in plist_del(). The current code of plist_del() just uses the head for checking, so it will not cause a bad result even when we use a faked head. But it is undocumented usage: /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ The document says that the @head is the "list head" head of the priority list. In futex code, several places use "plist_del(&q->list, &q->list.plist);", they pass a fake head. We need to fix them all. Thanks to Darren Hart for many suggestions. Acked-by: Darren Hart Signed-off-by: Lai Jiangshan LKML-Reference: <4D11984A.5030203@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/futex.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..6feeea4f8f15 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -775,6 +775,24 @@ retry: return ret; } +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) + || plist_node_empty(&q->list))) + return; + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -792,7 +810,7 @@ static void wake_futex(struct futex_q *q) */ get_task_struct(p); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as * q->lock_ptr = NULL is written, without taking any locks. 
A @@ -1100,8 +1118,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, get_futex_key_refs(key); q->key = *key; - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; @@ -1504,8 +1521,7 @@ retry: spin_unlock(lock_ptr); goto retry; } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(q->pi_state); @@ -1525,8 +1541,7 @@ retry: static void unqueue_me_pi(struct futex_q *q) __releases(q->lock_ptr) { - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -2167,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &q->list.plist); + plist_del(&q->list, &hb->chain); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; -- cgit v1.2.1 From 017f2b239dabb2740b91df162e004371b861f371 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 21 Dec 2010 17:55:10 +0800 Subject: futex,plist: Remove debug lock assignment from plist_node The original code uses &plist_node->plist as the fake head of the priority list for plist_del(), these debug locks in the fake head are needed for CONFIG_DEBUG_PI_LIST. But now we always pass the real head to plist_del(), the debug locks in plist_node will not be used, so we remove these assignments. Acked-by: Darren Hart Signed-off-by: Lai Jiangshan LKML-Reference: <4D10797E.7040803@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/futex.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6feeea4f8f15..9fe913141ec9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1089,9 +1089,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_del(&q->list, &hb1->chain); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb2->lock; -#endif } get_futex_key_refs(key2); q->key = *key2; @@ -1124,9 +1121,6 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->rt_waiter = NULL; q->lock_ptr = &hb->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif wake_up_state(q->task, TASK_NORMAL); } @@ -1474,9 +1468,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); -- cgit v1.2.1 From d209a699a0b975ad47f399d70ddc3791f1b84496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 21:22:14 +0100 Subject: genirq: Add chip flag to force mask on suspend On suspend we disable all interrupts in the core code, but this does not mask the interrupt line in the default implementation as we use a lazy disable approach. That means we mark the interrupt disabled, but leave the hardware unmasked. That's an optimization because we avoid the hardware access for the common case where no interrupt happens after we marked it disabled. If an interrupt happens, then the interrupt flow handler masks the line at the hardware level and marks it pending. 
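A short driver-side sketch of the lazy disable just described (dev is a hypothetical device structure; only the semantics come from this changelog):

        disable_irq(dev->irq);  /* marked disabled, hardware line left unmasked */
        /*
         * If the device raises the line now, the flow handler masks it at
         * the hardware level and records it as pending instead of invoking
         * the handler.
         */
        enable_irq(dev->irq);   /* unmask again; a pending interrupt is resent */

The IRQCHIP_MASK_ON_SUSPEND flag added by this patch tells the core to drop that laziness for non-wakeup interrupts on the way into suspend and mask them for real.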
Suspend makes use of this delayed disable as it "disables" all interrupts when preparing the suspend transition. Right before the system goes into hardware suspend state it checks whether one of the interrupts which is marked as a wakeup interrupt came in after disabling it. Most interrupt chips have a separate register which selects the interrupts which can wake up the system from suspend, so we don't have to mask any on the non wakeup interrupts. But now we have to deal with brilliant designed hardware which lacks such a wakeup configuration facility. For such hardware it's necessary to mask all non wakeup interrupts before going into suspend in order to avoid the wakeup from random interrupts. Rather than working around this in the affected interrupt chip implementations we can solve this elegant in the core code itself. Add a flag IRQCHIP_MASK_ON_SUSPEND which can be set by the irq chip implementation to indicate, that the interrupts which are not selected as wakeup sources must be masked in the suspend path. Mask them in the loop which checks the wakeup interrupts pending flag. Signed-off-by: Thomas Gleixner Reviewed-by: Abhijeet Dharmapurikar LKML-Reference: --- kernel/irq/pm.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 1329f0eff49e..f76fc00c9877 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -68,10 +68,24 @@ int check_wakeup_irqs(void) struct irq_desc *desc; int irq; - for_each_irq_desc(irq, desc) - if (irqd_is_wakeup_set(&desc->irq_data) && - (desc->istate & IRQS_PENDING)) - return -EBUSY; + for_each_irq_desc(irq, desc) { + if (irqd_is_wakeup_set(&desc->irq_data)) { + if (desc->istate & IRQS_PENDING) + return -EBUSY; + continue; + } + /* + * Check the non wakeup interrupts whether they need + * to be masked before finally going into suspend + * state. That's for hardware which has no wakeup + * source configuration facility. The chip + * implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (desc->istate & IRQS_SUSPENDED && + irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + } return 0; } -- cgit v1.2.1 From 6e6823d17b157f185be09f4c70181299f9273f0b Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 3 Mar 2011 18:26:14 +0100 Subject: posix-clocks: Check write permissions in posix syscalls pc_clock_settime() and pc_clock_adjtime() do not check whether the fd was opened in write mode, so a clock can be set with a read only fd. 
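A hedged userspace illustration of the hole (the device node and the inlined fd-to-clockid encoding are assumptions for the example, not part of the patch):

        int fd = open("/dev/ptp0", O_RDONLY);           /* read-only open */
        clockid_t clk = ((~(clockid_t)fd) << 3) | 3;    /* usual CLOCKFD encoding */
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 0 };

        /* Could succeed before this patch despite O_RDONLY; afterwards it
         * fails with EACCES. */
        clock_settime(clk, &ts);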
[ tglx: We deliberately do not return -EPERM as we want this to be distingushable from the capability based permission check ] Signed-off-by: Torben Hohn LKML-Reference: <1299173174-348-4-git-send-email-torbenh@gmx.de> Cc: Richard Cochran Cc: John Stultz Cc: Thomas Gleixner --- kernel/time/posix-clock.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 04498cbf6002..25028dd4fa18 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -287,11 +287,16 @@ static int pc_clock_adjtime(clockid_t id, struct timex *tx) if (err) return err; + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + if (cd.clk->ops.clock_adjtime) err = cd.clk->ops.clock_adjtime(cd.clk, tx); else err = -EOPNOTSUPP; - +out: put_clock_desc(&cd); return err; @@ -344,11 +349,16 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts) if (err) return err; + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else err = -EOPNOTSUPP; - +out: put_clock_desc(&cd); return err; -- cgit v1.2.1 From 15a9155fe3e8215c02b80df51ec2cac7c0d726ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Feb 2011 15:08:54 -0500 Subject: fix race in audit_get_nd() don't rely on pathname resolution ending up twice at the same point... Signed-off-by: Al Viro --- kernel/audit_watch.c | 85 ++++++++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c7866460..20b9fe6907d0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) +static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = ndp->path.dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct audit_parent *parent; int ret; @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } /* Get path information necessary for adding watches. 
*/ -static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata *ndparent, *ndwatch; + struct nameidata nd; + struct dentry *d; int err; - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; + err = path_lookup(watch->path, LOOKUP_PARENT, &nd); + if (err) + return err; - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return -EINVAL; } - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return PTR_ERR(d); } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; + if (d->d_inode) { + /* update watch filter fields */ + watch->dev = d->d_inode->i_sb->s_dev; + watch->ino = d->d_inode->i_ino; } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - *ndp = ndparent; - *ndw = ndwatch; - + *parent = nd.path; + dput(d); return 0; } -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - /* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; struct audit_parent *parent; - struct nameidata *ndp = NULL, *ndw = NULL; + struct path parent_path; int h, ret = 0; mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. 
*/ - ret = audit_get_nd(watch->path, &ndp, &ndw); - if (ret) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - goto error; - } + ret = audit_get_nd(watch, &parent_path); + /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } + if (ret) + return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(ndp->path.dentry->d_inode); + parent = audit_find_parent(parent_path.dentry->d_inode); if (!parent) { - parent = audit_init_parent(ndp); + parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto error; @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ + path_put(&parent_path); return ret; - } void audit_remove_watch_rule(struct audit_krule *krule) -- cgit v1.2.1 From c9c6cac0c2bdbda42e7b804838648d0bc60ddb13 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Feb 2011 15:15:47 -0500 Subject: kill path_lookup() all remaining callers pass LOOKUP_PARENT to it, so flags argument can die; renamed to kern_path_parent() Signed-off-by: Al Viro --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 20b9fe6907d0..e683869365d9 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -359,7 +359,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d; int err; - err = path_lookup(watch->path, LOOKUP_PARENT, &nd); + err = kern_path_parent(watch->path, &nd); if (err) return err; -- cgit v1.2.1 From 73d049a40fc6269189c4e2ba6792cb5dd054883c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 11 Mar 2011 12:08:24 -0500 Subject: open-style analog of vfs_path_lookup() new function: file_open_root(dentry, mnt, name, flags) opens the file vfs_path_lookup would arrive to. Note that name can be empty; in that case the usual requirement that dentry should be a directory is lifted. open-coded equivalents switched to it, may_open() got down exactly one caller and became static. Signed-off-by: Al Viro --- kernel/sysctl_binary.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bedf7c9a..3b8e028b9601 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { const struct bin_table *table = NULL; - struct nameidata nd; struct vfsmount *mnt; struct file *file; ssize_t result; char *pathname; int flags; - int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, /* How should the sysctl be accessed? 
*/ if (oldval && oldlen && newval && newlen) { flags = O_RDWR; - acc_mode = MAY_READ | MAY_WRITE; } else if (newval && newlen) { flags = O_WRONLY; - acc_mode = MAY_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; - acc_mode = MAY_READ; } else { result = 0; goto out_putname; } mnt = current->nsproxy->pid_ns->proc_mnt; - result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); - if (result) - goto out_putname; - - result = may_open(&nd.path, acc_mode, flags); - if (result) - goto out_putpath; - - file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; @@ -1370,10 +1357,6 @@ out_putname: putname(pathname); out: return result; - -out_putpath: - path_put(&nd.path); - goto out_putname; } -- cgit v1.2.1 From 9d90c8d9cde929cbc575098e825d7c29d9f45054 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 13 Mar 2011 03:19:51 +0100 Subject: printk: do not mangle valid userspace syslog prefixes printk: do not mangle valid userspace syslog prefixes with /dev/kmsg Log messages passed to the kernel log by using /dev/kmsg or /dev/ttyprintk might contain a syslog prefix including the syslog facility value. This makes printk to recognize these headers properly, extract the real log level from it to use, and add the prefix as a proper prefix to the log buffer, instead of wrongly printing it as the log message text. Before: $ echo '<14>text' > /dev/kmsg $ dmesg -r <4>[135159.594810] <14>text After: $ echo '<14>text' > /dev/kmsg $ dmesg -r <14>[ 50.750654] text Cc: Lennart Poettering Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 138 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2ddbdc73aade..5e3d042e7001 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -498,6 +498,71 @@ static void _call_console_drivers(unsigned start, } } +/* + * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the + * lower 3 bit are the log level, the rest are the log facility. In case + * userspace passes usual userspace syslog messages to /dev/kmsg or + * /dev/ttyprintk, the log prefix might contain the facility. Printk needs + * to extract the correct log level for in-kernel processing, and not mangle + * the original value. + * + * If a prefix is found, the length of the prefix is returned. If 'level' is + * passed, it will be filled in with the log level without a possible facility + * value. If 'special' is passed, the special printk prefix chars are accepted + * and returned. If no valid header is found, 0 is returned and the passed + * variables are not touched. + */ +static size_t log_prefix(const char *p, unsigned int *level, char *special) +{ + unsigned int lev = 0; + char sp = '\0'; + size_t len; + + if (p[0] != '<' || !p[1]) + return 0; + if (p[2] == '>') { + /* usual single digit level number or special char */ + switch (p[1]) { + case '0' ... 
'7': + lev = p[1] - '0'; + break; + case 'c': /* KERN_CONT */ + case 'd': /* KERN_DEFAULT */ + sp = p[1]; + break; + default: + return 0; + } + len = 3; + } else { + /* multi digit including the level and facility number */ + char *endp = NULL; + + if (p[1] < '0' && p[1] > '9') + return 0; + + lev = (simple_strtoul(&p[1], &endp, 10) & 7); + if (endp == NULL || endp[0] != '>') + return 0; + len = (endp + 1) - p; + } + + /* do not accept special char if not asked for */ + if (sp && !special) + return 0; + + if (special) { + *special = sp; + /* return special char, do not touch level */ + if (sp) + return len; + } + + if (level) + *level = lev; + return len; +} + /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. @@ -513,13 +578,9 @@ static void call_console_drivers(unsigned start, unsigned end) cur_index = start; start_print = start; while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') { - msg_level = LOG_BUF(cur_index + 1) - '0'; - cur_index += 3; + if (msg_level < 0 && ((end - cur_index) > 2)) { + /* strip log prefix */ + cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); start_print = cur_index; } while (cur_index != end) { @@ -717,6 +778,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) unsigned long flags; int this_cpu; char *p; + size_t plen; + char special; boot_delay_msec(); printk_delay(); @@ -757,45 +820,52 @@ asmlinkage int vprintk(const char *fmt, va_list args) printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); - p = printk_buf; - /* Do we have a loglevel in the string? */ - if (p[0] == '<') { - unsigned char c = p[1]; - if (c && p[2] == '>') { - switch (c) { - case '0' ... '7': /* loglevel */ - current_log_level = c - '0'; - /* Fallthrough - make sure we're on a new line */ - case 'd': /* KERN_DEFAULT */ - if (!new_text_line) { - emit_log_char('\n'); - new_text_line = 1; - } - /* Fallthrough - skip the loglevel */ - case 'c': /* KERN_CONT */ - p += 3; - break; + /* Read log level and handle special printk prefix */ + plen = log_prefix(p, ¤t_log_level, &special); + if (plen) { + p += plen; + + switch (special) { + case 'c': /* Strip KERN_CONT, continue line */ + plen = 0; + break; + case 'd': /* Strip KERN_DEFAULT, start new line */ + plen = 0; + default: + if (!new_text_line) { + emit_log_char('\n'); + new_text_line = 1; } } } /* - * Copy the output into log_buf. If the caller didn't provide - * appropriate log level tags, we insert them here + * Copy the output into log_buf. 
If the caller didn't provide + * the appropriate log prefix, we insert them here */ - for ( ; *p; p++) { + for (; *p; p++) { if (new_text_line) { - /* Always output the token */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; new_text_line = 0; + if (plen) { + /* Copy original log prefix */ + int i; + + for (i = 0; i < plen; i++) + emit_log_char(printk_buf[i]); + printed_len += plen; + } else { + /* Add log prefix */ + emit_log_char('<'); + emit_log_char(current_log_level + '0'); + emit_log_char('>'); + printed_len += 3; + } + if (printk_time) { - /* Follow the token with the time */ + /* Add the current time stamp */ char tbuf[50], *tp; unsigned tlen; unsigned long long t; -- cgit v1.2.1 From 6e0aa9f8a8190e0879a29bd67aa606b51734a122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Mar 2011 10:34:35 +0100 Subject: futex: Deobfuscate handle_futex_death() handle_futex_death() uses futex_atomic_cmpxchg_inatomic() without disabling page faults. That's ok, but totally non obvious. We don't hold locks so we actually can and want to fault here, because the get_user() before futex_atomic_cmpxchg_inatomic() does not guarantee a R/W mapping. We could just add a big fat comment to explain this, but actually changing the code so that the functionality is entirely clear is better. Use the helper function which disables page faults around the futex_atomic_cmpxchg_inatomic() and handle a fault with a call to fault_in_user_writeable() as all other places in the futex code do as well. Pointed-out-by: Linus Torvalds Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Cc: Michel Lespinasse Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: David Howells Cc: Tony Luck Cc: Michal Simek Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/futex.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c6bef6e404fe..e9251d934f7d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2458,9 +2458,20 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval)) - return -1; - + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } if (nval != uval) goto retry; -- cgit v1.2.1 From cd51e61cf4e8b220da37dc35e9c2dc2dc258b4de Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:04:52 +0100 Subject: PM / ACPI: Remove references to pm_flags from bus.c If direct references to pm_flags are removed from drivers/acpi/bus.c, CONFIG_ACPI will not need to depend on CONFIG_PM any more. Make that happen. Signed-off-by: Rafael J. 
Wysocki Acked-by: Len Brown --- kernel/power/main.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 701853042c28..b5405af48ddb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -17,10 +17,20 @@ DEFINE_MUTEX(pm_mutex); +#ifdef CONFIG_PM_SLEEP + unsigned int pm_flags; EXPORT_SYMBOL(pm_flags); -#ifdef CONFIG_PM_SLEEP +bool pm_apm_enabled(void) +{ + return !!(pm_flags & PM_APM); +} + +void pm_set_acpi_flag(void) +{ + pm_flags |= PM_ACPI; +} /* Routines for PM-transition notifications */ -- cgit v1.2.1 From 1eb208aea3179dd2fc0cdeea45ef869d75b4fe70 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:06:30 +0100 Subject: PM: Make CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME) From the users' point of view CONFIG_PM is really only used for making it possible to set CONFIG_SUSPEND, CONFIG_HIBERNATION, CONFIG_PM_RUNTIME and (surprisingly enough) CONFIG_XEN_SAVE_RESTORE (CONFIG_PM_OPP also depends on CONFIG_PM, but quite artificially). However, both CONFIG_SUSPEND and CONFIG_HIBERNATION require platform support (independent of CONFIG_PM) and it is not quite obvious that CONFIG_PM has to be set for CONFIG_XEN_SAVE_RESTORE to be available. Thus, from the users' point of view, it would be more logical to automatically select CONFIG_PM if any of the above options depending on it are set. Make CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME), which will cause it to be selected when any of CONFIG_SUSPEND, CONFIG_HIBERNATION, CONFIG_PM_RUNTIME, CONFIG_XEN_SAVE_RESTORE is set and will clarify its meaning. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 265729966ece..d739cf612e50 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,23 +1,7 @@ config PM - bool "Power Management support" - depends on !IA64_HP_SIM - ---help--- - "Power Management" means that parts of your computer are shut - off or put into a power conserving "sleep" mode if they are not - being used. There are two competing standards for doing this: APM - and ACPI. If you want to use either one, say Y here and then also - to the requisite support below. - - Power Management is most important for battery powered laptop - computers; if you have a laptop, check out the Linux Laptop home - page on the WWW at or - Tuxmobil - Linux on Mobile Computers at - and the Battery Powered Linux mini-HOWTO, available from - . - - Note that, even if you say N here, Linux on the x86 architecture - will issue the hlt instruction if nothing is to be done, thereby - sending the processor to sleep and saving power. 
+ bool + depends on PM_SLEEP || PM_RUNTIME + default y config PM_DEBUG bool "Power Management Debug Support" @@ -102,7 +86,7 @@ config PM_SLEEP_ADVANCED_DEBUG config SUSPEND bool "Suspend to RAM and standby" - depends on PM && ARCH_SUSPEND_POSSIBLE + depends on ARCH_SUSPEND_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is @@ -133,7 +117,7 @@ config SUSPEND_FREEZER config HIBERNATION bool "Hibernation (aka 'suspend to disk')" - depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + depends on SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -224,7 +208,7 @@ config APM_EMULATION config PM_RUNTIME bool "Run-time PM core functionality" - depends on PM + depends on !IA64_HP_SIM ---help--- Enable functionality allowing I/O devices to be put into energy-saving (low power) states at run time (or autosuspended) after a specified @@ -246,7 +230,6 @@ config ARCH_HAS_OPP config PM_OPP bool "Operating Performance Point (OPP) Layer library" - depends on PM depends on ARCH_HAS_OPP ---help--- SOCs have a standard set of tuples consisting of frequency and -- cgit v1.2.1 From 196ec243224bb38fc5c41d9fa4050f70708b7fb4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:06:42 +0100 Subject: PM: Reorder power management Kconfig options Reorder configuration options in kernel/power/Kconfig so that the options depended on are at the top of the list. This patch doesn't introduce any functional changes. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 222 +++++++++++++++++++++++++-------------------------- 1 file changed, 111 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d739cf612e50..1e4d9238c5da 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,89 +1,3 @@ -config PM - bool - depends on PM_SLEEP || PM_RUNTIME - default y - -config PM_DEBUG - bool "Power Management Debug Support" - depends on PM - ---help--- - This option enables various debugging support in the Power Management - code. This is helpful when debugging and reporting PM bugs, like - suspend support. - -config PM_ADVANCED_DEBUG - bool "Extra PM attributes in sysfs for low-level debugging/testing" - depends on PM_DEBUG - default n - ---help--- - Add extra sysfs attributes allowing one to access some Power Management - fields of device objects from user space. If you are not a kernel - developer interested in debugging/testing Power Management, say "no". - -config PM_VERBOSE - bool "Verbose Power Management debugging" - depends on PM_DEBUG - default n - ---help--- - This option enables verbose messages from the Power Management code. - -config CAN_PM_TRACE - def_bool y - depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL - -config PM_TRACE - bool - help - This enables code to save the last PM event point across - reboot. The architecture needs to support this, x86 for - example does by saving things in the RTC, see below. - - The architecture specific code must provide the extern - functions from as well as the - header with a TRACE_RESUME() macro. - - The way the information is presented is architecture- - dependent, x86 will print the information during a - late_initcall. 
- -config PM_TRACE_RTC - bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE - depends on X86 - select PM_TRACE - default n - ---help--- - This enables some cheesy code to save the last PM event point in the - RTC across reboots, so that you can debug a machine that just hangs - during suspend (or more commonly, during resume). - - To use this debugging feature you should attempt to suspend the - machine, reboot it and then run - - dmesg -s 1000000 | grep 'hash matches' - - CAUTION: this option will cause your machine's real-time clock to be - set to an invalid time after a resume. - -config PM_SLEEP_SMP - bool - depends on SMP - depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE - depends on PM_SLEEP - select HOTPLUG - select HOTPLUG_CPU - default y - -config PM_SLEEP - bool - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE - default y - -config PM_SLEEP_ADVANCED_DEBUG - bool - depends on PM_ADVANCED_DEBUG - default n - config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE @@ -93,17 +7,6 @@ config SUSPEND powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). -config PM_TEST_SUSPEND - bool "Test suspend/resume and wakealarm during bootup" - depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- - This option will let you suspend your machine during bootup, and - make it wake up a few seconds later using an RTC wakeup alarm. - Enable this with a kernel parameter like "test_suspend=mem". - - You probably want to have your system's RTC driver statically - linked, ensuring that it's available when this test runs. - config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN @@ -180,6 +83,117 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config PM_SLEEP + bool + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + default y + +config PM_SLEEP_SMP + bool + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG + select HOTPLUG_CPU + default y + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on !IA64_HP_SIM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. + +config PM + bool + depends on PM_SLEEP || PM_RUNTIME + default y + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_VERBOSE + bool "Verbose Power Management debugging" + depends on PM_DEBUG + default n + ---help--- + This option enables verbose messages from the Power Management code. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + default n + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. 
If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_SLEEP_ADVANCED_DEBUG + bool + depends on PM_ADVANCED_DEBUG + default n + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + +config CAN_PM_TRACE + def_bool y + depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this, x86 for + example does by saving things in the RTC, see below. + + The architecture specific code must provide the extern + functions from as well as the + header with a TRACE_RESUME() macro. + + The way the information is presented is architecture- + dependent, x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on CAN_PM_TRACE + depends on X86 + select PM_TRACE + default n + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + config APM_EMULATION tristate "Advanced Power Management Emulation" depends on PM && SYS_SUPPORTS_APM_EMULATION @@ -206,20 +220,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config PM_RUNTIME - bool "Run-time PM core functionality" - depends on !IA64_HP_SIM - ---help--- - Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated - wake-up event or a driver's request. - - Hardware support is generally required for this functionality to work - and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and - wake-up events. - config PM_OPS bool depends on PM_SLEEP || PM_RUNTIME -- cgit v1.2.1 From aa33860158114d0df3c7997bc1dd41c0168e1c2a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:06:54 +0100 Subject: PM: Remove CONFIG_PM_OPS After redefining CONFIG_PM to depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME) the CONFIG_PM_OPS option is redundant and can be replaced with CONFIG_PM. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 1e4d9238c5da..0710beead05b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -220,11 +220,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). 
-config PM_OPS - bool - depends on PM_SLEEP || PM_RUNTIME - default y - config ARCH_HAS_OPP bool -- cgit v1.2.1 From 88a6f33e4d7143f94f8e787fa4065ac873507984 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 20:31:11 +0100 Subject: PM: Clean up PM_TRACE dependencies and drop unnecessary Kconfig option CONFIG_PM_SLEEP_ADVANCED_DEBUG is not used any more, so drop it and CONFIG_CAN_PM_TRACE need not depend on EXPERIMENTAL, so remove that dependency. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 0710beead05b..298bed04555e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -140,11 +140,6 @@ config PM_ADVANCED_DEBUG fields of device objects from user space. If you are not a kernel developer interested in debugging/testing Power Management, say "no". -config PM_SLEEP_ADVANCED_DEBUG - bool - depends on PM_ADVANCED_DEBUG - default n - config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" depends on SUSPEND && PM_DEBUG && RTC_CLASS=y @@ -158,7 +153,7 @@ config PM_TEST_SUSPEND config CAN_PM_TRACE def_bool y - depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL + depends on PM_DEBUG && PM_SLEEP config PM_TRACE bool -- cgit v1.2.1 From 6831c6edc7b272a08dd2a6c71bb183a48fe98ae6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Feb 2011 21:22:24 +0100 Subject: PM: Drop pm_flags that is not necessary The variable pm_flags is used to prevent APM from being enabled along with ACPI, which would lead to problems. However, acpi_init() is always called before apm_init() and after acpi_init() has returned, it is known whether or not ACPI will be used. Namely, if acpi_disabled is not set after acpi_init() has returned, this means that ACPI is enabled. Thus, it is sufficient to check acpi_disabled in apm_init() to prevent APM from being enabled in parallel with ACPI. Signed-off-by: Rafael J. Wysocki Acked-by: Len Brown --- kernel/power/main.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b5405af48ddb..8eaba5f27b10 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -19,19 +19,6 @@ DEFINE_MUTEX(pm_mutex); #ifdef CONFIG_PM_SLEEP -unsigned int pm_flags; -EXPORT_SYMBOL(pm_flags); - -bool pm_apm_enabled(void) -{ - return !!(pm_flags & PM_APM); -} - -void pm_set_acpi_flag(void) -{ - pm_flags |= PM_ACPI; -} - /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); -- cgit v1.2.1 From cf4fb80ca3d591cae366ae8364e3c3f7a68bd249 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Feb 2011 01:05:36 +0100 Subject: PM: Simplify kernel/power/Kconfig 'n' defaults are pretty pointless and actually bogus when used with prompt-less config options. The "bool"/"default y" pair with no prompt can be expressed more compactly using def_bool. [rjw: Rebased on top of earlier patches modifying this file.] Signed-off-by: Jan Beulich Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 298bed04555e..4603f08dc47b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -84,18 +84,16 @@ config PM_STD_PARTITION device. 
config PM_SLEEP - bool + def_bool y depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE - default y config PM_SLEEP_SMP - bool + def_bool y depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP select HOTPLUG select HOTPLUG_CPU - default y config PM_RUNTIME bool "Run-time PM core functionality" @@ -112,9 +110,8 @@ config PM_RUNTIME wake-up events. config PM - bool + def_bool y depends on PM_SLEEP || PM_RUNTIME - default y config PM_DEBUG bool "Power Management Debug Support" @@ -127,14 +124,12 @@ config PM_DEBUG config PM_VERBOSE bool "Verbose Power Management debugging" depends on PM_DEBUG - default n ---help--- This option enables verbose messages from the Power Management code. config PM_ADVANCED_DEBUG bool "Extra PM attributes in sysfs for low-level debugging/testing" depends on PM_DEBUG - default n ---help--- Add extra sysfs attributes allowing one to access some Power Management fields of device objects from user space. If you are not a kernel @@ -175,7 +170,6 @@ config PM_TRACE_RTC depends on CAN_PM_TRACE depends on X86 select PM_TRACE - default n ---help--- This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs -- cgit v1.2.1 From f9b9e806ae0ede772cbb9916d9ac7354a123d044 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Mon, 28 Feb 2011 22:06:34 +0100 Subject: PM QoS: Make pm_qos settings readable I have a machine where entering deep C-states broke. pm_qos was a hot candidate, but I couldn't find any way to double check without the need of recompiling. While in this case it was a driver bug (ath9k): https://bugzilla.kernel.org/show_bug.cgi?id=27532 powertop or others may want to read out cpu_dma_latency restrictions which could be the cause of preventing a machine entering deeper C-states. Output with this patch: # default value of 2000 * USEC_PER_SEC (0x77359400) cat /dev/network_latency |hexdump 0000000 9400 7735 0000004 # value of 55 us which is the reason for not entering C2 cat /dev/cpu_dma_latency |hexdump 0000000 0037 0000 0000004 There is no reason to hide this info -> make pm_qos files readable. Signed-off-by: Thomas Renninger Signed-off-by: Rafael J. 
Wysocki --- kernel/pm_qos_params.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index aeaa7f846821..0da058bff8eb 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = { static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); static int pm_qos_power_release(struct inode *inode, struct file *filp); static const struct file_operations pm_qos_power_fops = { .write = pm_qos_power_write, + .read = pm_qos_power_read, .open = pm_qos_power_open, .release = pm_qos_power_release, .llseek = noop_llseek, @@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) } +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + unsigned long flags; + struct pm_qos_object *o; + struct pm_qos_request_list *pm_qos_req = filp->private_data;; + + if (!pm_qos_req) + return -EINVAL; + if (!pm_qos_request_active(pm_qos_req)) + return -EINVAL; + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(o); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { -- cgit v1.2.1 From 40dc166cb5dddbd36aa4ad11c03915ea538f5a61 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2011 00:43:46 +0100 Subject: PM / Core: Introduce struct syscore_ops for core subsystems PM Some subsystems need to carry out suspend/resume and shutdown operations with one CPU on-line and interrupts disabled. The only way to register such operations is to define a sysdev class and a sysdev specifically for this purpose which is cumbersome and inefficient. Moreover, the arguments taken by sysdev suspend, resume and shutdown callbacks are practically never necessary. For this reason, introduce a simpler interface allowing subsystems to register operations to be executed very late during system suspend and shutdown and very early during resume in the form of strcut syscore_ops objects. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 9 +++++++++ kernel/power/suspend.c | 4 ++++ kernel/sys.c | 4 ++++ 3 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1832bd264219..aeabd26e3342 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -272,6 +273,8 @@ static int create_image(int platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_FREEZE); + if (!error) + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -295,6 +298,7 @@ static int create_image(int platform_mode) } Power_up: + syscore_resume(); sysdev_resume(); /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. 
@@ -403,6 +407,8 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_QUIESCE); + if (!error) + error = syscore_suspend(); if (error) goto Enable_irqs; @@ -429,6 +435,7 @@ static int resume_target_kernel(bool platform_mode) restore_processor_state(); touch_softlockup_watchdog(); + syscore_resume(); sysdev_resume(); Enable_irqs: @@ -516,6 +523,7 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -526,6 +534,7 @@ int hibernation_platform_enter(void) while (1); Power_up: + syscore_resume(); sysdev_resume(); local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index de6f86bfa303..2814c32aed51 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "power.h" @@ -163,11 +164,14 @@ static int suspend_enter(suspend_state_t state) BUG_ON(!irqs_disabled()); error = sysdev_suspend(PMSG_SUSPEND); + if (!error) + error = syscore_suspend(); if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } + syscore_resume(); sysdev_resume(); } diff --git a/kernel/sys.c b/kernel/sys.c index 18da702ec813..1ad48b3b9068 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -298,6 +299,7 @@ void kernel_restart_prepare(char *cmd) system_state = SYSTEM_RESTART; device_shutdown(); sysdev_shutdown(); + syscore_shutdown(); } /** @@ -336,6 +338,7 @@ void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); @@ -355,6 +358,7 @@ void kernel_power_off(void) pm_power_off_prepare(); disable_nonboot_cpus(); sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); -- cgit v1.2.1 From bea3864fb627d110933cfb8babe048b63c4fc76e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2011 00:45:46 +0100 Subject: PM / Hibernate: Reduce autotuned default image size The hibernate image size autotuning mechanism sets the default image size to 5/2 of the total system RAM, but it is reported that on some systems device drivers allocate substantial amounts of memory during suspend and the creation of the image fails as a result (too little memory is preallocated). Modify the autotuning mechanism to use 1/3 instead of 2/5 of RAM as the default image size, which is reported to be sufficient for the affected systems. References: https://bugzilla.kernel.org/show_bug.cgi?id=30482 Reported-and-tested-by: Martin Steigerwald Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 64db648ff911..ca0aacc24874 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *); /* * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. 
+ * When it is set to N, the image creating code will do its best to + * ensure the image size will not exceed N bytes, but if that is + * impossible, it will try to create the smallest image possible. */ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = (totalram_pages / 3) * PAGE_SIZE; } /* List of PBEs needed for restoring the pages that were allocated before -- cgit v1.2.1 From 990d6c2d7aee921e3bce22b2d6a750fd552262be Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:26 +0530 Subject: vfs: Add name to file handle conversion support The syscall also return mount id which can be used to lookup file system specific information such as uuid in /proc//mountinfo Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- kernel/sys_ni.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9924c7..4e013439ac28 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -186,3 +186,6 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); + +/* open by handle */ +cond_syscall(sys_name_to_handle_at); -- cgit v1.2.1 From becfd1f37544798cbdfd788f32c827160fab98c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:26 +0530 Subject: vfs: Add open by file handle support [AV: duplicate of open() guts removed; file_open_root() used instead] Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- kernel/sys_ni.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4e013439ac28..25cc41cd8f33 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -189,3 +189,5 @@ cond_syscall(sys_fanotify_mark); /* open by handle */ cond_syscall(sys_name_to_handle_at); +cond_syscall(sys_open_by_handle_at); +cond_syscall(compat_sys_open_by_handle_at); -- cgit v1.2.1 From 1fd06bb1571e2618ae392e2484925bf0dadd7857 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 15 Mar 2011 16:12:30 -0700 Subject: sched.c: fix kernel-doc for runqueue_is_locked() Fix kernel-doc warning for runqueue_is_locked(): Warning(kernel/sched.c:664): missing initial short description on line: Signed-off-by: Randy Dunlap Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c8e40b7005c0..58d66ea7d200 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -661,10 +661,9 @@ static void update_rq_clock(struct rq *rq) #endif /** - * runqueue_is_locked + * runqueue_is_locked - Returns true if the current cpu runqueue is locked * @cpu: the processor in question. * - * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -- cgit v1.2.1
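Referring back to the name_to_handle_at()/open_by_handle_at() entries wired up above, a hedged userspace sketch of how the pair is meant to be used (paths are placeholders, error handling is omitted, and the glibc wrappers shown here only appeared later):

        #define _GNU_SOURCE
        #include <fcntl.h>      /* struct file_handle, MAX_HANDLE_SZ, AT_FDCWD */
        #include <stdlib.h>

        int main(void)
        {
                struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
                int mount_id;

                fh->handle_bytes = MAX_HANDLE_SZ;
                name_to_handle_at(AT_FDCWD, "some/file", fh, &mount_id, 0);

                /* later, possibly in another process that holds
                 * CAP_DAC_READ_SEARCH; any fd on the same mount will do */
                int mount_fd = open("/mnt", O_RDONLY | O_DIRECTORY);
                int fd = open_by_handle_at(mount_fd, fh, O_RDONLY);

                return fd < 0 ? 1 : 0;
        }

As the changelog above notes, the returned mount_id can be matched against the per-process mountinfo file in /proc to look up filesystem-specific data such as the UUID.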