From c6622f63db86fcbd41bf6fe05ddf2e00c1e51ced Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Fri, 24 Feb 2006 10:06:59 +1100
Subject: powerpc: Implement accurate task and CPU time accounting

This implements accurate task and cpu time accounting for 64-bit
powerpc kernels.  Instead of accounting a whole jiffy of time to a
task on a timer interrupt because that task happened to be running at
the time, we now account time in units of timebase ticks according to
the actual time spent by the task in user mode and kernel mode.  We
also count the time spent processing hardware and software interrupts
accurately.  This is conditional on CONFIG_VIRT_CPU_ACCOUNTING.  If
that is not set, we do tick-based approximate accounting as before.

To get this accurate information, we read either the PURR (processor
utilization of resources register) on POWER5 machines, or the timebase
on other machines on

* each entry to the kernel from usermode
* each exit to usermode
* transitions between process context, hard irq context and soft irq
  context in kernel mode
* context switches.

On POWER5 systems with shared-processor logical partitioning we also
read both the PURR and the timebase at each timer interrupt and
context switch in order to determine how much time has been taken by
the hypervisor to run other partitions ("steal" time).  Unfortunately,
since we need values of the PURR on both threads at the same time to
accurately calculate the steal time, and since we can only calculate
steal time on a per-core basis, the apportioning of the steal time
between idle time (time which we ceded to the hypervisor in the idle
loop) and actual stolen time is somewhat approximate at the moment.

This is all based quite heavily on what s390 does, and it uses the
generic interfaces that were added by the s390 developers, i.e.
account_system_time(), account_user_time(), etc.

This patch doesn't add any new interfaces between the kernel and
userspace, and doesn't change the units in which time is reported to
userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(),
times(), etc.  Internally the various task and cpu times are stored in
timebase units, but they are converted to USER_HZ units (1/100th of a
second) when reported to userspace.  Some precision is therefore lost
but there should not be any accumulating error, since the internal
accumulation is at full precision.

Signed-off-by: Paul Mackerras
---
 arch/powerpc/kernel/time.c | 236 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 234 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/kernel/time.c')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2a7ddc579379..0b34db28916f 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -135,6 +136,220 @@ unsigned long tb_last_stamp;
  */
 DEFINE_PER_CPU(unsigned long, last_jiffy);
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/*
+ * Factors for converting from cputime_t (timebase ticks) to
+ * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds).
+ * These are all stored as 0.64 fixed-point binary fractions.
+ */ +u64 __cputime_jiffies_factor; +u64 __cputime_msec_factor; +u64 __cputime_sec_factor; +u64 __cputime_clockt_factor; + +static void calc_cputime_factors(void) +{ + struct div_result res; + + div128_by_32(HZ, 0, tb_ticks_per_sec, &res); + __cputime_jiffies_factor = res.result_low; + div128_by_32(1000, 0, tb_ticks_per_sec, &res); + __cputime_msec_factor = res.result_low; + div128_by_32(1, 0, tb_ticks_per_sec, &res); + __cputime_sec_factor = res.result_low; + div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); + __cputime_clockt_factor = res.result_low; +} + +/* + * Read the PURR on systems that have it, otherwise the timebase. + */ +static u64 read_purr(void) +{ + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + return mftb(); +} + +/* + * Account time for a transition between system, hard irq + * or soft irq state. + */ +void account_system_vtime(struct task_struct *tsk) +{ + u64 now, delta; + unsigned long flags; + + local_irq_save(flags); + now = read_purr(); + delta = now - get_paca()->startpurr; + get_paca()->startpurr = now; + if (!in_interrupt()) { + delta += get_paca()->system_time; + get_paca()->system_time = 0; + } + account_system_time(tsk, 0, delta); + local_irq_restore(flags); +} + +/* + * Transfer the user and system times accumulated in the paca + * by the exception entry and exit code to the generic process + * user and system time records. + * Must be called with interrupts disabled. + */ +void account_process_vtime(struct task_struct *tsk) +{ + cputime_t utime; + + utime = get_paca()->user_time; + get_paca()->user_time = 0; + account_user_time(tsk, utime); +} + +static void account_process_time(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + account_process_vtime(current); + run_local_timers(); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_mode(regs)); + scheduler_tick(); + run_posix_cpu_timers(current); +} + +#ifdef CONFIG_PPC_SPLPAR +/* + * Stuff for accounting stolen time. + */ +struct cpu_purr_data { + int initialized; /* thread is running */ + u64 tb0; /* timebase at origin time */ + u64 purr0; /* PURR at origin time */ + u64 tb; /* last TB value read */ + u64 purr; /* last PURR value read */ + u64 stolen; /* stolen time so far */ + spinlock_t lock; +}; + +static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data); + +static void snapshot_tb_and_purr(void *data) +{ + struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data); + + p->tb0 = mftb(); + p->purr0 = mfspr(SPRN_PURR); + p->tb = p->tb0; + p->purr = 0; + wmb(); + p->initialized = 1; +} + +/* + * Called during boot when all cpus have come up. 
+ */ +void snapshot_timebases(void) +{ + int cpu; + + if (!cpu_has_feature(CPU_FTR_PURR)) + return; + for_each_cpu(cpu) + spin_lock_init(&per_cpu(cpu_purr_data, cpu).lock); + on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1); +} + +void calculate_steal_time(void) +{ + u64 tb, purr, t0; + s64 stolen; + struct cpu_purr_data *p0, *pme, *phim; + int cpu; + + if (!cpu_has_feature(CPU_FTR_PURR)) + return; + cpu = smp_processor_id(); + pme = &per_cpu(cpu_purr_data, cpu); + if (!pme->initialized) + return; /* this can happen in early boot */ + p0 = &per_cpu(cpu_purr_data, cpu & ~1); + phim = &per_cpu(cpu_purr_data, cpu ^ 1); + spin_lock(&p0->lock); + tb = mftb(); + purr = mfspr(SPRN_PURR) - pme->purr0; + if (!phim->initialized || !cpu_online(cpu ^ 1)) { + stolen = (tb - pme->tb) - (purr - pme->purr); + } else { + t0 = pme->tb0; + if (phim->tb0 < t0) + t0 = phim->tb0; + stolen = phim->tb - t0 - phim->purr - purr - p0->stolen; + } + if (stolen > 0) { + account_steal_time(current, stolen); + p0->stolen += stolen; + } + pme->tb = tb; + pme->purr = purr; + spin_unlock(&p0->lock); +} + +/* + * Must be called before the cpu is added to the online map when + * a cpu is being brought up at runtime. + */ +static void snapshot_purr(void) +{ + int cpu; + u64 purr; + struct cpu_purr_data *p0, *pme, *phim; + unsigned long flags; + + if (!cpu_has_feature(CPU_FTR_PURR)) + return; + cpu = smp_processor_id(); + pme = &per_cpu(cpu_purr_data, cpu); + p0 = &per_cpu(cpu_purr_data, cpu & ~1); + phim = &per_cpu(cpu_purr_data, cpu ^ 1); + spin_lock_irqsave(&p0->lock, flags); + pme->tb = pme->tb0 = mftb(); + purr = mfspr(SPRN_PURR); + if (!phim->initialized) { + pme->purr = 0; + pme->purr0 = purr; + } else { + /* set p->purr and p->purr0 for no change in p0->stolen */ + pme->purr = phim->tb - phim->tb0 - phim->purr - p0->stolen; + pme->purr0 = purr - pme->purr; + } + pme->initialized = 1; + spin_unlock_irqrestore(&p0->lock, flags); +} + +#endif /* CONFIG_PPC_SPLPAR */ + +#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ +#define calc_cputime_factors() +#define account_process_time(regs) update_process_times(user_mode(regs)) +#define calculate_steal_time() do { } while (0) +#endif + +#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR)) +#define snapshot_purr() do { } while (0) +#endif + +/* + * Called when a cpu comes up after the system has finished booting, + * i.e. as a result of a hotplug cpu action. + */ +void snapshot_timebase(void) +{ + __get_cpu_var(last_jiffy) = get_tb(); + snapshot_purr(); +} + void __delay(unsigned long loops) { unsigned long start; @@ -382,6 +597,7 @@ static void iSeries_tb_recal(void) new_tb_ticks_per_jiffy, sign, tick_diff ); tb_ticks_per_jiffy = new_tb_ticks_per_jiffy; tb_ticks_per_sec = new_tb_ticks_per_sec; + calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; @@ -430,6 +646,7 @@ void timer_interrupt(struct pt_regs * regs) irq_enter(); profile_tick(CPU_PROFILING, regs); + calculate_steal_time(); #ifdef CONFIG_PPC_ISERIES get_lppaca()->int_dword.fields.decr_int = 0; @@ -451,7 +668,7 @@ void timer_interrupt(struct pt_regs * regs) * is the case. 
*/ if (!cpu_is_offline(cpu)) - update_process_times(user_mode(regs)); + account_process_time(regs); /* * No need to check whether cpu is offline here; boot_cpuid @@ -508,13 +725,27 @@ void wakeup_decrementer(void) void __init smp_space_timers(unsigned int max_cpus) { int i; + unsigned long half = tb_ticks_per_jiffy / 2; unsigned long offset = tb_ticks_per_jiffy / max_cpus; unsigned long previous_tb = per_cpu(last_jiffy, boot_cpuid); /* make sure tb > per_cpu(last_jiffy, cpu) for all cpus always */ previous_tb -= tb_ticks_per_jiffy; + /* + * The stolen time calculation for POWER5 shared-processor LPAR + * systems works better if the two threads' timebase interrupts + * are staggered by half a jiffy with respect to each other. + */ for_each_cpu(i) { - if (i != boot_cpuid) { + if (i == boot_cpuid) + continue; + if (i == (boot_cpuid ^ 1)) + per_cpu(last_jiffy, i) = + per_cpu(last_jiffy, boot_cpuid) - half; + else if (i & 1) + per_cpu(last_jiffy, i) = + per_cpu(last_jiffy, i ^ 1) + half; + else { previous_tb += offset; per_cpu(last_jiffy, i) = previous_tb; } @@ -706,6 +937,7 @@ void __init time_init(void) tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); + calc_cputime_factors(); /* * Calculate the length of each tick in ns. It will not be -- cgit v1.2.3 From 2cf82c0256b198ae28c465f2c4d7c12c836ea5ea Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 27 Feb 2006 15:41:47 +1100 Subject: powerpc: Export variables used in conversions to/from cputime_t The inline cputime_to_foo and foo_to_cputime conversion functions in include/asm-powerpc/cputime.h refer to 5 variables, which need to be exported if those functions are to be usable from modules. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/time.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 0b34db28916f..4f20a5f15d49 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -99,6 +99,7 @@ unsigned long tb_ticks_per_jiffy; unsigned long tb_ticks_per_usec = 100; /* sane default */ EXPORT_SYMBOL(tb_ticks_per_usec); unsigned long tb_ticks_per_sec; +EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime_t conversions */ u64 tb_to_xs; unsigned tb_to_us; @@ -143,9 +144,13 @@ DEFINE_PER_CPU(unsigned long, last_jiffy); * These are all stored as 0.64 fixed-point binary fractions. */ u64 __cputime_jiffies_factor; +EXPORT_SYMBOL(__cputime_jiffies_factor); u64 __cputime_msec_factor; +EXPORT_SYMBOL(__cputime_msec_factor); u64 __cputime_sec_factor; +EXPORT_SYMBOL(__cputime_sec_factor); u64 __cputime_clockt_factor; +EXPORT_SYMBOL(__cputime_clockt_factor); static void calc_cputime_factors(void) { -- cgit v1.2.3 From 0a45d4491d0f172e02126370f312405c5d473363 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 15 Mar 2006 13:47:15 +1100 Subject: powerpc: Fix problem with time going backwards The recent changes to keep gettimeofday in sync with xtime had the side effect that it was occasionally possible for the time reported by gettimeofday to go back by a microsecond. There were two reasons: (1) when we recalculated the offsets used by gettimeofday every 2^31 timebase ticks, we lost an accumulated fractional microsecond, and (2) because the update is done some time after the notional start of jiffy, if ntp is slowing the clock, it is possible to see time go backwards when the timebase factor gets reduced. 
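To make case (2) concrete, the sketch below is a standalone userspace program (not kernel code) that redoes the gettimeofday arithmetic with a 0.64 fixed-point tb_to_xs factor that has just been reduced and is applied a few million timebase ticks after the notional origin; the naive result steps backwards unless the new stamp is bumped, which is the check this patch adds to timer_recalc_offset() further down. mulhdu() is emulated here with GCC's unsigned __int128, and every numeric value is an invented example, not taken from real hardware.

/*
 * Standalone illustration (not kernel code) of case (2) above.
 * A 0.64 fixed-point tb_to_xs factor is reduced (ntp slowing the
 * clock) and the new origin is installed `delay' ticks late; the
 * value computed with the new parameters can then be smaller than
 * one userspace could already have seen with the old parameters.
 * mulhdu() is emulated with 128-bit arithmetic; all numbers here
 * are invented examples.
 */
#include <stdint.h>
#include <stdio.h>

/* high 64 bits of a 64x64 unsigned multiply, like the kernel's mulhdu() */
static uint64_t mulhdu(uint64_t a, uint64_t b)
{
	return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

int main(void)
{
	uint64_t tb_to_xs_old = 0x0087000000000000ULL;	/* xsec per tick, 0.64 */
	uint64_t tb_to_xs_new = 0x0086f00000000000ULL;	/* reduced by ~460 ppm */
	uint64_t cur_tb = 1ULL << 31;		/* notional new origin (2^31 ticks) */
	uint64_t tb = cur_tb + 5000000;		/* "now": update applied ~10 ms late */

	/* stamp for the new origin, still computed with the old factor */
	uint64_t new_stamp_xsec = mulhdu(cur_tb, tb_to_xs_old);

	uint64_t xsec_old = mulhdu(tb, tb_to_xs_old);		/* old parameters */
	uint64_t xsec_new = mulhdu(tb - cur_tb, tb_to_xs_new) + new_stamp_xsec;

	printf("old %llu, new %llu (time went back by %lld xsec)\n",
	       (unsigned long long)xsec_old, (unsigned long long)xsec_new,
	       (long long)(xsec_old - xsec_new));

	/* the adjustment added by this patch: never let the result decrease */
	if (xsec_new < xsec_old)
		new_stamp_xsec += xsec_old - xsec_new;
	return 0;
}

With these made-up numbers the naive computation steps back by a few xsec, which is exactly the situation the adjustment described next is meant to avoid.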
This fixes it by (a) slowing the gettimeofday clock by about 1us in 2^31 timebase ticks (a factor of less than 1 in 3.7 million), and (b) adjusting the timebase offsets in the rare case that the gettimeofday result could possibly go backwards (i.e. when ntp is slowing the clock and the timer interrupt is late). In this case the adjustment will reduce to zero eventually because of (a). Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/time.c | 48 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2a7ddc579379..86f7e3d154d8 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -283,9 +283,9 @@ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, * the two values of tb_update_count match and are even then the * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. + * We expect the caller to have done the first increment of + * vdso_data->tb_update_count already. */ - ++(vdso_data->tb_update_count); - smp_wmb(); vdso_data->tb_orig_stamp = new_tb_stamp; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; @@ -310,20 +310,15 @@ static __inline__ void timer_recalc_offset(u64 cur_tb) unsigned long offset; u64 new_stamp_xsec; u64 tlen, t2x; + u64 tb, xsec_old, xsec_new; + struct gettimeofday_vars *varp; if (__USE_RTC()) return; tlen = current_tick_length(); offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if (tlen == last_tick_len && offset < 0x80000000u) { - /* check that we're still in sync; if not, resync */ - struct timeval tv; - __do_gettimeofday(&tv, cur_tb); - if (tv.tv_sec <= xtime.tv_sec && - (tv.tv_sec < xtime.tv_sec || - tv.tv_usec * 1000 <= xtime.tv_nsec)) - return; - } + if (tlen == last_tick_len && offset < 0x80000000u) + return; if (tlen != last_tick_len) { t2x = mulhdu(tlen << TICKLEN_SHIFT, ticklen_to_xs); last_tick_len = tlen; @@ -332,6 +327,21 @@ static __inline__ void timer_recalc_offset(u64 cur_tb) new_stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; do_div(new_stamp_xsec, 1000000000); new_stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; + + ++vdso_data->tb_update_count; + smp_mb(); + + /* + * Make sure time doesn't go backwards for userspace gettimeofday. + */ + tb = get_tb(); + varp = do_gtod.varp; + xsec_old = mulhdu(tb - varp->tb_orig_stamp, varp->tb_to_xs) + + varp->stamp_xsec; + xsec_new = mulhdu(tb - cur_tb, t2x) + new_stamp_xsec; + if (xsec_new < xsec_old) + new_stamp_xsec += xsec_old - xsec_new; + update_gtod(cur_tb, new_stamp_xsec, t2x); } @@ -564,6 +574,10 @@ int do_settimeofday(struct timespec *tv) } #endif + /* Make userspace gettimeofday spin until we're done. */ + ++vdso_data->tb_update_count; + smp_mb(); + /* * Subtract off the number of nanoseconds since the * beginning of the last tick. @@ -724,10 +738,16 @@ void __init time_init(void) * It is computed as: * ticklen_to_xs = 2^N / (tb_ticks_per_jiffy * 1e9) * where N = 64 + 20 - TICKLEN_SCALE - TICKLEN_SHIFT - * so as to give the result as a 0.64 fixed-point fraction. + * which turns out to be N = 51 - SHIFT_HZ. + * This gives the result as a 0.64 fixed-point fraction. + * That value is reduced by an offset amounting to 1 xsec per + * 2^31 timebase ticks to avoid problems with time going backwards + * by 1 xsec when we do timer_recalc_offset due to losing the + * fractional xsec. 
That offset is equal to ppc_tb_freq/2^51 + * since there are 2^20 xsec in a second. */ - div128_by_32(1ULL << (64 + 20 - TICKLEN_SCALE - TICKLEN_SHIFT), 0, - tb_ticks_per_jiffy, &res); + div128_by_32((1ULL << 51) - ppc_tb_freq, 0, + tb_ticks_per_jiffy << SHIFT_HZ, &res); div128_by_32(res.result_high, res.result_low, NSEC_PER_SEC, &res); ticklen_to_xs = res.result_low; -- cgit v1.2.3 From 0e5519548fdc8eadc3eacb49b1908d44d347fb2b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 28 Mar 2006 14:50:51 -0800 Subject: [PATCH] for_each_possible_cpu: powerpc for_each_cpu() actually iterates across all possible CPUs. We've had mistakes in the past where people were using for_each_cpu() where they should have been iterating across only online or present CPUs. This is inefficient and possibly buggy. We're renaming for_each_cpu() to for_each_possible_cpu() to avoid this in the future. This patch replaces for_each_cpu with for_each_possible_cpu. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/lparcfg.c | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/setup_64.c | 6 +++--- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/kernel/sysfs.c | 6 +++--- arch/powerpc/kernel/time.c | 4 ++-- arch/powerpc/mm/stab.c | 2 +- arch/powerpc/platforms/cell/interrupt.c | 2 +- arch/powerpc/platforms/cell/pervasive.c | 2 +- arch/powerpc/platforms/pseries/xics.c | 2 +- include/asm-powerpc/percpu.h | 2 +- 14 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 771a59cbd213..bb5c9501234c 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -379,7 +379,7 @@ void irq_ctx_init(void) struct thread_info *tp; int i; - for_each_cpu(i) { + for_each_possible_cpu(i) { memset((void *)softirq_ctx[i], 0, THREAD_SIZE); tp = softirq_ctx[i]; tp->cpu = i; diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index e789fef4eb8a..1b73508ecb2b 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -56,7 +56,7 @@ static unsigned long get_purr(void) unsigned long sum_purr = 0; int cpu; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { sum_purr += lppaca[cpu].emulated_time_base; #ifdef PURR_DEBUG @@ -222,7 +222,7 @@ static unsigned long get_purr(void) int cpu; struct cpu_usage *cu; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { cu = &per_cpu(cpu_usage_array, cpu); sum_purr += cu->current_tb; } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 4b78ee0e5867..06636c927a7e 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -593,7 +593,7 @@ static void rtas_percpu_suspend_me(void *info) data->waiting = 0; data->args->args[data->args->nargs] = rtas_call(ibm_suspend_me_token, 0, 1, NULL); - for_each_cpu(i) + for_each_possible_cpu(i) plpar_hcall_norets(H_PROD,i); } else { data->waiting = -EBUSY; @@ -626,7 +626,7 @@ static int rtas_ibm_suspend_me(struct rtas_args *args) /* Prod each CPU. 
This won't hurt, and will wake * anyone we successfully put to sleep with H_Join */ - for_each_cpu(i) + for_each_possible_cpu(i) plpar_hcall_norets(H_PROD, i); return data.waiting; diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 3473cb9cb0ab..c607f3b9ca17 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -431,7 +431,7 @@ void __init smp_setup_cpu_maps(void) /* * Do the sibling map; assume only two threads per processor. */ - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { cpu_set(cpu, cpu_sibling_map[cpu]); if (cpu_has_feature(CPU_FTR_SMT)) cpu_set(cpu ^ 0x1, cpu_sibling_map[cpu]); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index ae9c33d70731..a72bf5dceeee 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -226,7 +226,7 @@ int __init ppc_init(void) if ( ppc_md.progress ) ppc_md.progress(" ", 0xffff); /* register CPU devices */ - for_each_cpu(i) + for_each_possible_cpu(i) register_cpu(&cpu_devices[i], i, NULL); /* call platform init */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 05b152299396..59aa92cd6fa4 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -474,7 +474,7 @@ static void __init irqstack_early_init(void) * interrupt stacks must be under 256MB, we cannot afford to take * SLB misses on them. */ - for_each_cpu(i) { + for_each_possible_cpu(i) { softirq_ctx[i] = (struct thread_info *) __va(lmb_alloc_base(THREAD_SIZE, THREAD_SIZE, 0x10000000)); @@ -507,7 +507,7 @@ static void __init emergency_stack_init(void) */ limit = min(0x10000000UL, lmb.rmo_size); - for_each_cpu(i) + for_each_possible_cpu(i) paca[i].emergency_sp = __va(lmb_alloc_base(HW_PAGE_SIZE, 128, limit)) + HW_PAGE_SIZE; } @@ -624,7 +624,7 @@ void __init setup_per_cpu_areas(void) size = PERCPU_ENOUGH_ROOM; #endif - for_each_cpu(i) { + for_each_possible_cpu(i) { ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 805eaedbc308..530f7dba0bd2 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -362,7 +362,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) smp_space_timers(max_cpus); - for_each_cpu(cpu) + for_each_possible_cpu(cpu) if (cpu != boot_cpuid) smp_create_idle(cpu); } diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index aca2f09cd842..73560ef6f802 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -74,7 +74,7 @@ static int __init smt_setup(void) val = (unsigned int *)get_property(options, "ibm,smt-snooze-delay", NULL); if (!smt_snooze_cmdline && val) { - for_each_cpu(cpu) + for_each_possible_cpu(cpu) per_cpu(smt_snooze_delay, cpu) = *val; } @@ -93,7 +93,7 @@ static int __init setup_smt_snooze_delay(char *str) smt_snooze_cmdline = 1; if (get_option(&str, &snooze)) { - for_each_cpu(cpu) + for_each_possible_cpu(cpu) per_cpu(smt_snooze_delay, cpu) = snooze; } @@ -347,7 +347,7 @@ static int __init topology_init(void) register_cpu_notifier(&sysfs_cpu_nb); - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); #ifdef CONFIG_NUMA diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 4a27218a086c..24e3ad756de0 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -261,7 +261,7 @@ void 
snapshot_timebases(void) if (!cpu_has_feature(CPU_FTR_PURR)) return; - for_each_cpu(cpu) + for_each_possible_cpu(cpu) spin_lock_init(&per_cpu(cpu_purr_data, cpu).lock); on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1); } @@ -751,7 +751,7 @@ void __init smp_space_timers(unsigned int max_cpus) * systems works better if the two threads' timebase interrupts * are staggered by half a jiffy with respect to each other. */ - for_each_cpu(i) { + for_each_possible_cpu(i) { if (i == boot_cpuid) continue; if (i == (boot_cpuid ^ 1)) diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 91d25fb27f89..4a9291d9fef8 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -239,7 +239,7 @@ void stabs_alloc(void) if (cpu_has_feature(CPU_FTR_SLB)) return; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned long newstab; if (cpu == 0) diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index ae62f5d5c31b..978be1c30c1b 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -364,7 +364,7 @@ void iic_init_IRQ(void) setup_iic_hardcoded(); irq_offset = 0; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { iic = &per_cpu(iic, cpu); if (iic->regs) out_be64(&iic->regs->prio, 0xff); diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c index 58baeb52f6fc..7eed8c624517 100644 --- a/arch/powerpc/platforms/cell/pervasive.c +++ b/arch/powerpc/platforms/cell/pervasive.c @@ -217,7 +217,7 @@ void __init cell_pervasive_init(void) if (!cpu_has_feature(CPU_FTR_PAUSE_ZERO)) return; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { p = &cbe_pervasive[cpu]; ret = cbe_find_pmd_mmio(cpu, p); if (ret) diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index c60d3ff25a2f..4864cb32be25 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -541,7 +541,7 @@ nextnode: ops = &pSeriesLP_ops; else { #ifdef CONFIG_SMP - for_each_cpu(i) { + for_each_possible_cpu(i) { int hard_id; /* FIXME: Do this dynamically! --RR */ diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h index 464301cd0d03..184a7a4d2fdf 100644 --- a/include/asm-powerpc/percpu.h +++ b/include/asm-powerpc/percpu.h @@ -27,7 +27,7 @@ #define percpu_modcopy(pcpudst, src, size) \ do { \ unsigned int __i; \ - for_each_cpu(__i) \ + for_each_possible_cpu(__i) \ memcpy((pcpudst)+__per_cpu_offset(__i), \ (src), (size)); \ } while (0) -- cgit v1.2.3
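As a closing note on the conversion factors set up by calc_cputime_factors() above and exported for the cputime_t helpers in include/asm-powerpc/cputime.h, here is a small userspace sketch (not the kernel implementation, which is not shown in this log) of how a 0.64 fixed-point factor such as __cputime_clockt_factor can be built and applied. The 512 MHz timebase frequency is an assumed example value, and 128-bit arithmetic stands in for the kernel's div128_by_32() and mulhdu() helpers.

/*
 * Userspace sketch of the 0.64 fixed-point conversion used by the
 * cputime code above: factor = USER_HZ / tb_ticks_per_sec stored as
 * a 0.64 binary fraction, applied with a "multiply high" to turn
 * timebase ticks into clock_t (1/USER_HZ seconds).  The timebase
 * frequency is an assumed example, not a real machine's value.
 */
#include <stdint.h>
#include <stdio.h>

#define USER_HZ 100ULL

/* num/den as a 0.64 fixed-point fraction, i.e. (num << 64) / den */
static uint64_t fixed_0_64(uint64_t num, uint64_t den)
{
	return (uint64_t)(((unsigned __int128)num << 64) / den);
}

/* high 64 bits of a 64x64 unsigned multiply, like the kernel's mulhdu() */
static uint64_t mulhdu(uint64_t a, uint64_t b)
{
	return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

int main(void)
{
	uint64_t tb_ticks_per_sec = 512000000ULL;	/* assumed 512 MHz timebase */
	uint64_t cputime_clockt_factor = fixed_0_64(USER_HZ, tb_ticks_per_sec);
	uint64_t ticks = 1792001000ULL;	/* a little over 3.5 s of timebase ticks */

	/* ticks -> clock_t: prints 350, i.e. 3.5 seconds in USER_HZ units */
	printf("clock_t = %llu\n",
	       (unsigned long long)mulhdu(ticks, cputime_clockt_factor));
	return 0;
}

Because the factor is rounded down, a conversion can lose up to one USER_HZ unit, which matches the "some precision is therefore lost" remark in the first commit message above.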