10 files changed, 445 insertions, 24 deletions
diff --git a/lib/Makefile b/lib/Makefile
index 6c4ea33bb2cb..c5f215d509d3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,7 +5,8 @@
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o argv_split.o
+	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
+	 proportions.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/argv_split.c b/lib/argv_split.c
index 4096ed42f490..fad6ce4f7b57 100644
--- a/lib/argv_split.c
+++ b/lib/argv_split.c
@@ -75,7 +75,9 @@ char **argv_split(gfp_t gfp, const char *str, int *argcp)
 	if (argv == NULL)
 		goto out;
 
-	*argcp = argc;
+	if (argcp)
+		*argcp = argc;
+
 	argvp = argv;
 
 	while (*str) {
diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c
index accb35658169..486da62b2b07 100644
--- a/lib/bust_spinlocks.c
+++ b/lib/bust_spinlocks.c
@@ -17,13 +17,13 @@
 void __attribute__((weak)) bust_spinlocks(int yes)
 {
 	if (yes) {
-		oops_in_progress = 1;
+		++oops_in_progress;
 	} else {
 #ifdef CONFIG_VT
 		unblank_screen();
 #endif
-		oops_in_progress = 0;
-		wake_up_klogd();
+		if (--oops_in_progress == 0)
+			wake_up_klogd();
 	}
 }
 
diff --git a/lib/idr.c b/lib/idr.c
index 09cbe2b69edb..afbb0b1023d4 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -580,8 +580,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 }
 EXPORT_SYMBOL(idr_replace);
 
-static void idr_cache_ctor(void * idr_layer, struct kmem_cache *idr_layer_cache,
-		unsigned long flags)
+static void idr_cache_ctor(struct kmem_cache *idr_layer_cache, void *idr_layer)
 {
 	memset(idr_layer, 0, sizeof(struct idr_layer));
 }
diff --git a/lib/iomap.c b/lib/iomap.c
index 864f2ec1966e..72c42687ba10 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -40,7 +40,7 @@ static void bad_io_access(unsigned long port, const char *access)
 	static int count = 10;
 	if (count) {
 		count--;
-		printk(KERN_ERR "Bad IO access at port %lx (%s)\n", port, access);
+		printk(KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access);
 		WARN_ON(1);
 	}
 }
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 760521417b69..14c6078f17a2 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -8,6 +8,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/io.h>
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index cf22c617baa4..9659eabffc31 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -14,15 +14,29 @@ static LIST_HEAD(percpu_counters);
 static DEFINE_MUTEX(percpu_counters_lock);
 #endif
 
-void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
+void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
-	long count;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	for_each_possible_cpu(cpu) {
+		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+		*pcount = 0;
+	}
+	fbc->count = amount;
+	spin_unlock(&fbc->lock);
+}
+EXPORT_SYMBOL(percpu_counter_set);
+
+void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+{
+	s64 count;
 	s32 *pcount;
 	int cpu = get_cpu();
 
 	pcount = per_cpu_ptr(fbc->counters, cpu);
 	count = *pcount + amount;
-	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+	if (count >= batch || count <= -batch) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
 		*pcount = 0;
@@ -32,13 +46,13 @@ void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
 	}
 	put_cpu();
 }
-EXPORT_SYMBOL(percpu_counter_mod);
+EXPORT_SYMBOL(__percpu_counter_add);
 
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -50,25 +64,43 @@ s64 percpu_counter_sum(struct percpu_counter *fbc)
 		ret += *pcount;
 	}
 	spin_unlock(&fbc->lock);
-	return ret < 0 ? 0 : ret;
+	return ret;
 }
-EXPORT_SYMBOL(percpu_counter_sum);
+EXPORT_SYMBOL(__percpu_counter_sum);
+
+static struct lock_class_key percpu_counter_irqsafe;
 
-void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
 	spin_lock_init(&fbc->lock);
 	fbc->count = amount;
 	fbc->counters = alloc_percpu(s32);
+	if (!fbc->counters)
+		return -ENOMEM;
 #ifdef CONFIG_HOTPLUG_CPU
 	mutex_lock(&percpu_counters_lock);
 	list_add(&fbc->list, &percpu_counters);
 	mutex_unlock(&percpu_counters_lock);
 #endif
+	return 0;
 }
 EXPORT_SYMBOL(percpu_counter_init);
 
+int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)
+{
+	int err;
+
+	err = percpu_counter_init(fbc, amount);
+	if (!err)
+		lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
+	return err;
+}
+
 void percpu_counter_destroy(struct percpu_counter *fbc)
 {
+	if (!fbc->counters)
+		return;
+
 	free_percpu(fbc->counters);
 #ifdef CONFIG_HOTPLUG_CPU
 	mutex_lock(&percpu_counters_lock);
diff --git a/lib/proportions.c b/lib/proportions.c
new file mode 100644
index 000000000000..332d8c58184d
--- /dev/null
+++ b/lib/proportions.c
@@ -0,0 +1,384 @@
+/*
+ * Floating proportions
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Description:
+ *
+ * The floating proportion is a time derivative with an exponentially decaying
+ * history:
+ *
+ *   p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
+ *
+ * Where j is an element from {prop_local}, x_{j} is j's number of events,
+ * and i the time period over which the differential is taken. So d/dt_{-i} is
+ * the differential over the i-th last period.
+ *
+ * The decaying history gives smooth transitions. The time differential carries
+ * the notion of speed.
+ *
+ * The denominator is 2^(1+i) because we want the series to be normalised, ie.
+ *
+ *   \Sum_{i=0} 1/2^(1+i) = 1
+ *
+ * Further more, if we measure time (t) in the same events as x; so that:
+ *
+ *   t = \Sum_{j} x_{j}
+ *
+ * we get that:
+ *
+ *   \Sum_{j} p_{j} = 1
+ *
+ * Writing this in an iterative fashion we get (dropping the 'd's):
+ *
+ *   if (++x_{j}, ++t > period)
+ *     t /= 2;
+ *     for_each (j)
+ *       x_{j} /= 2;
+ *
+ * so that:
+ *
+ *   p_{j} = x_{j} / t;
+ *
+ * We optimize away the '/= 2' for the global time delta by noting that:
+ *
+ *   if (++t > period) t /= 2:
+ *
+ * Can be approximated by:
+ *
+ *   period/2 + (++t % period/2)
+ *
+ * [ Furthermore, when we choose period to be 2^n it can be written in terms of
+ *   binary operations and wraparound artefacts disappear. ]
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ *   c = t / (period/2)
+ *
+ * [ Its monotonic increasing property can be applied to mitigate the wrap-
+ *   around issue. ]
+ *
+ * This allows us to do away with the loop over all prop_locals on each period
+ * expiration. By remembering the period count under which it was last accessed
+ * as c_{j}, we can obtain the number of 'missed' cycles from:
+ *
+ *   c - c_{j}
+ *
+ * We can then lazily catch up to the global period count every time we are
+ * going to use x_{j}, by doing:
+ *
+ *   x_{j} /= 2^(c - c_{j}), c_{j} = c
+ */
+
+#include <linux/proportions.h>
+#include <linux/rcupdate.h>
+
+/*
+ * Limit the time part in order to ensure there are some bits left for the
+ * cycle counter.
+ */
+#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
+
+int prop_descriptor_init(struct prop_descriptor *pd, int shift)
+{
+	int err;
+
+	if (shift > PROP_MAX_SHIFT)
+		shift = PROP_MAX_SHIFT;
+
+	pd->index = 0;
+	pd->pg[0].shift = shift;
+	mutex_init(&pd->mutex);
+	err = percpu_counter_init_irq(&pd->pg[0].events, 0);
+	if (err)
+		goto out;
+
+	err = percpu_counter_init_irq(&pd->pg[1].events, 0);
+	if (err)
+		percpu_counter_destroy(&pd->pg[0].events);
+
+out:
+	return err;
+}
+
+/*
+ * We have two copies, and flip between them to make it seem like an atomic
+ * update. The update is not really atomic wrt the events counter, but
+ * it is internally consistent with the bit layout depending on shift.
+ *
+ * We copy the events count, move the bits around and flip the index.
+ */
+void prop_change_shift(struct prop_descriptor *pd, int shift)
+{
+	int index;
+	int offset;
+	u64 events;
+	unsigned long flags;
+
+	if (shift > PROP_MAX_SHIFT)
+		shift = PROP_MAX_SHIFT;
+
+	mutex_lock(&pd->mutex);
+
+	index = pd->index ^ 1;
+	offset = pd->pg[pd->index].shift - shift;
+	if (!offset)
+		goto out;
+
+	pd->pg[index].shift = shift;
+
+	local_irq_save(flags);
+	events = percpu_counter_sum(&pd->pg[pd->index].events);
+	if (offset < 0)
+		events <<= -offset;
+	else
+		events >>= offset;
+	percpu_counter_set(&pd->pg[index].events, events);
+
+	/*
+	 * ensure the new pg is fully written before the switch
+	 */
+	smp_wmb();
+	pd->index = index;
+	local_irq_restore(flags);
+
+	synchronize_rcu();
+
+out:
+	mutex_unlock(&pd->mutex);
+}
+
+/*
+ * wrap the access to the data in an rcu_read_lock() section;
+ * this is used to track the active references.
+ */
+static struct prop_global *prop_get_global(struct prop_descriptor *pd)
+{
+	int index;
+
+	rcu_read_lock();
+	index = pd->index;
+	/*
+	 * match the wmb from vcd_flip()
+	 */
+	smp_rmb();
+	return &pd->pg[index];
+}
+
+static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
+{
+	rcu_read_unlock();
+}
+
+static void
+prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
+{
+	int offset = *pl_shift - new_shift;
+
+	if (!offset)
+		return;
+
+	if (offset < 0)
+		*pl_period <<= -offset;
+	else
+		*pl_period >>= offset;
+
+	*pl_shift = new_shift;
+}
+
+/*
+ * PERCPU
+ */
+
+int prop_local_init_percpu(struct prop_local_percpu *pl)
+{
+	spin_lock_init(&pl->lock);
+	pl->shift = 0;
+	pl->period = 0;
+	return percpu_counter_init_irq(&pl->events, 0);
+}
+
+void prop_local_destroy_percpu(struct prop_local_percpu *pl)
+{
+	percpu_counter_destroy(&pl->events);
+}
+
+/*
+ * Catch up with missed period expirations.
+ *
+ *   until (c_{j} == c)
+ *     x_{j} -= x_{j}/2;
+ *     c_{j}++;
+ */
+static
+void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
+{
+	unsigned long period = 1UL << (pg->shift - 1);
+	unsigned long period_mask = ~(period - 1);
+	unsigned long global_period;
+	unsigned long flags;
+
+	global_period = percpu_counter_read(&pg->events);
+	global_period &= period_mask;
+
+	/*
+	 * Fast path - check if the local and global period count still match
+	 * outside of the lock.
+	 */
+	if (pl->period == global_period)
+		return;
+
+	spin_lock_irqsave(&pl->lock, flags);
+	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+	/*
+	 * For each missed period, we half the local counter.
+	 * basically:
+	 *   pl->events >> (global_period - pl->period);
+	 *
+	 * but since the distributed nature of percpu counters make division
+	 * rather hard, use a regular subtraction loop. This is safe, because
+	 * the events will only every be incremented, hence the subtraction
+	 * can never result in a negative number.
+	 */
+	while (pl->period != global_period) {
+		unsigned long val = percpu_counter_read(&pl->events);
+		unsigned long half = (val + 1) >> 1;
+
+		/*
+		 * Half of zero won't be much less, break out.
+		 * This limits the loop to shift iterations, even
+		 * if we missed a million.
+		 */
+		if (!val)
+			break;
+
+		percpu_counter_add(&pl->events, -half);
+		pl->period += period;
+	}
+	pl->period = global_period;
+	spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ *   ++x_{j}, ++t
+ */
+void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
+{
+	struct prop_global *pg = prop_get_global(pd);
+
+	prop_norm_percpu(pg, pl);
+	percpu_counter_add(&pl->events, 1);
+	percpu_counter_add(&pg->events, 1);
+	prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ *   p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_percpu(struct prop_descriptor *pd,
+		struct prop_local_percpu *pl,
+		long *numerator, long *denominator)
+{
+	struct prop_global *pg = prop_get_global(pd);
+	unsigned long period_2 = 1UL << (pg->shift - 1);
+	unsigned long counter_mask = period_2 - 1;
+	unsigned long global_count;
+
+	prop_norm_percpu(pg, pl);
+	*numerator = percpu_counter_read_positive(&pl->events);
+
+	global_count = percpu_counter_read(&pg->events);
+	*denominator = period_2 + (global_count & counter_mask);
+
+	prop_put_global(pd, pg);
+}
+
+/*
+ * SINGLE
+ */
+
+int prop_local_init_single(struct prop_local_single *pl)
+{
+	spin_lock_init(&pl->lock);
+	pl->shift = 0;
+	pl->period = 0;
+	pl->events = 0;
+	return 0;
+}
+
+void prop_local_destroy_single(struct prop_local_single *pl)
+{
+}
+
+/*
+ * Catch up with missed period expirations.
+ */
+static
+void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
+{
+	unsigned long period = 1UL << (pg->shift - 1);
+	unsigned long period_mask = ~(period - 1);
+	unsigned long global_period;
+	unsigned long flags;
+
+	global_period = percpu_counter_read(&pg->events);
+	global_period &= period_mask;
+
+	/*
+	 * Fast path - check if the local and global period count still match
+	 * outside of the lock.
+	 */
+	if (pl->period == global_period)
+		return;
+
+	spin_lock_irqsave(&pl->lock, flags);
+	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+	/*
+	 * For each missed period, we half the local counter.
+	 */
+	period = (global_period - pl->period) >> (pg->shift - 1);
+	if (likely(period < BITS_PER_LONG))
+		pl->events >>= period;
+	else
+		pl->events = 0;
+	pl->period = global_period;
+	spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ *   ++x_{j}, ++t
+ */
+void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
+{
+	struct prop_global *pg = prop_get_global(pd);
+
+	prop_norm_single(pg, pl);
+	pl->events++;
+	percpu_counter_add(&pg->events, 1);
+	prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ *   p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_single(struct prop_descriptor *pd,
+	       	struct prop_local_single *pl,
+		long *numerator, long *denominator)
+{
+	struct prop_global *pg = prop_get_global(pd);
+	unsigned long period_2 = 1UL << (pg->shift - 1);
+	unsigned long counter_mask = period_2 - 1;
+	unsigned long global_count;
+
+	prop_norm_single(pg, pl);
+	*numerator = pl->events;
+
+	global_count = percpu_counter_read(&pg->events);
+	*denominator = period_2 + (global_count & counter_mask);
+
+	prop_put_global(pd, pg);
+}
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6b26f9d39800..48c250fe2233 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1042,19 +1042,21 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(void *node, struct kmem_cache *cachep, unsigned long flags)
+radix_tree_node_ctor(struct kmem_cache *cachep, void *node)
 {
 	memset(node, 0, sizeof(struct radix_tree_node));
 }
 
 static __init unsigned long __maxindex(unsigned int height)
 {
-	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
-	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
-
-	if (tmp >= RADIX_TREE_INDEX_BITS)
-		index = ~0UL;
-	return index;
+	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
+	int shift = RADIX_TREE_INDEX_BITS - width;
+
+	if (shift < 0)
+		return ~0UL;
+	if (shift >= BITS_PER_LONG)
+		return 0UL;
+	return ~0UL >> shift;
 }
 
 static __init void radix_tree_init_maxindex(void)
diff --git a/lib/sort.c b/lib/sort.c
index 961567894d16..6abbaf3d5858 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -67,7 +67,7 @@ void sort(void *base, size_t num, size_t size,
 	}
 
 	/* sort */
-	for (i = n - size; i >= 0; i -= size) {
+	for (i = n - size; i > 0; i -= size) {
 		swap(base, base + i, size);
 		for (r = 0; r * 2 + size < i; r = c) {
 			c = r * 2 + size;