44 files changed, 4166 insertions, 871 deletions
diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig
index 74f83f4a4e5e..d9ac24e8de16 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -1455,7 +1455,8 @@ CONFIG_HAS_DMA=y
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
+CONFIG_OPROFILE_CELL=y
 # CONFIG_KPROBES is not set
 
 #
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index d3f2080d2eee..37658ea417fa 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -219,6 +219,72 @@ void crash_kexec_secondary(struct pt_regs *regs)
 	cpus_in_sr = CPU_MASK_NONE;
 }
 #endif
+#ifdef CONFIG_SPU_BASE
+
+#include <asm/spu.h>
+#include <asm/spu_priv1.h>
+
+struct crash_spu_info {
+	struct spu *spu;		/* registered SPU; NULL marks an unused slot */
+	u32 saved_spu_runcntl_RW;	/* problem-state spu_runcntl_RW at crash time */
+	u32 saved_spu_status_R;		/* problem-state spu_status_R at crash time */
+	u32 saved_spu_npc_RW;		/* problem-state spu_npc_RW at crash time */
+	u64 saved_mfc_sr1_RW;		/* SR1 as read before run control was cleared */
+	u64 saved_mfc_dar;		/* value returned by spu_mfc_dar_get() */
+	u64 saved_mfc_dsisr;		/* value returned by spu_mfc_dsisr_get() */
+};
+
+#define CRASH_NUM_SPUS	16	/* Enough for current hardware */
+static struct crash_spu_info crash_spu_info[CRASH_NUM_SPUS]; /* by spu->number */
+
+static void crash_kexec_stop_spus(void)
+{
+	struct spu *spu;
+	int i;
+	u64 tmp;
+	/* Save the state of every registered SPU, then clear its run control. */
+	for (i = 0; i < CRASH_NUM_SPUS; i++) {
+		if (!crash_spu_info[i].spu)	/* slot was never registered */
+			continue;
+
+		spu = crash_spu_info[i].spu;
+		/* Snapshot the problem-state registers into the table. */
+		crash_spu_info[i].saved_spu_runcntl_RW =
+			in_be32(&spu->problem->spu_runcntl_RW);
+		crash_spu_info[i].saved_spu_status_R =
+			in_be32(&spu->problem->spu_status_R);
+		crash_spu_info[i].saved_spu_npc_RW =
+			in_be32(&spu->problem->spu_npc_RW);
+		/* Snapshot the MFC registers; SR1 stays in tmp for reuse below. */
+		crash_spu_info[i].saved_mfc_dar    = spu_mfc_dar_get(spu);
+		crash_spu_info[i].saved_mfc_dsisr  = spu_mfc_dsisr_get(spu);
+		tmp = spu_mfc_sr1_get(spu);
+		crash_spu_info[i].saved_mfc_sr1_RW = tmp;
+		/* Write SR1 back with the master run control bit cleared. */
+		tmp &= ~MFC_STATE1_MASTER_RUN_CONTROL_MASK;
+		spu_mfc_sr1_set(spu, tmp);
+
+		__delay(200);	/* NOTE(review): presumably lets the SPU settle */
+	}
+}
+
+void crash_register_spus(struct list_head *list)
+{
+	struct spu *entry;
+
+	/* Remember each SPU on the list in the slot named by its number. */
+	list_for_each_entry(entry, list, full_list) {
+		/* WARN_ON() fires, and the SPU is skipped, if no slot exists. */
+		if (!WARN_ON(entry->number >= CRASH_NUM_SPUS))
+			crash_spu_info[entry->number].spu = entry;
+	}
+}
+
+#else /* !CONFIG_SPU_BASE */
+static inline void crash_kexec_stop_spus(void)	/* no-op stub */
+{
+}
+#endif /* CONFIG_SPU_BASE */
 
 void default_machine_crash_shutdown(struct pt_regs *regs)
 {
@@ -254,6 +320,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	crash_save_cpu(regs, crashing_cpu);
 	crash_kexec_prepare_cpus(crashing_cpu);
 	cpu_set(crashing_cpu, cpus_in_crash);
+	crash_kexec_stop_spus();
 	if (ppc_md.kexec_cpu_down)
 		ppc_md.kexec_cpu_down(1, 0);
 }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e5df167f7824..727a6699f2f4 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -122,6 +122,7 @@ extern struct timezone sys_tz;
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
diff --git a/arch/powerpc/oprofile/Kconfig b/arch/powerpc/oprofile/Kconfig
index eb2dece76a54..7089e79689b9 100644
--- a/arch/powerpc/oprofile/Kconfig
+++ b/arch/powerpc/oprofile/Kconfig
@@ -15,3 +15,10 @@ config OPROFILE
 
 	  If unsure, say N.
 
+config OPROFILE_CELL
+	bool "OProfile for Cell Broadband Engine"
+	depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m)
+	default y
+	help
+	  Profiling of Cell BE SPUs requires special support enabled
+	  by this option.
diff --git a/arch/powerpc/oprofile/Makefile b/arch/powerpc/oprofile/Makefile
index 4b5f9528218c..c5f64c3bd668 100644
--- a/arch/powerpc/oprofile/Makefile
+++ b/arch/powerpc/oprofile/Makefile
@@ -11,7 +11,9 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
+		cell/spu_profiler.o cell/vma_map.o \
+		cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o op_model_pa6t.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
new file mode 100644
index 000000000000..e5704f00c8b4
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -0,0 +1,97 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+#include "../../platforms/cell/cbe_regs.h"
+
+/* Result codes used for sync_start (see spu_sync_start) */
+#define SKIP_GENERIC_SYNC 0	/* SPU sync started; skip the generic sync */
+#define SYNC_START_ERROR -1	/* SPU sync could not be started */
+#define DO_GENERIC_SYNC 1
+
+struct spu_overlay_info {	/* map of sections within an SPU overlay */
+	unsigned int vma;	/* SPU virtual memory address from elf */
+	unsigned int size;	/* size of section from elf */
+	unsigned int offset;	/* offset of section into elf file */
+	unsigned int buf;	/* 1-based index into the _ovly_buf_table */
+};
+
+struct vma_to_fileoffset_map {	/* map of sections within an SPU program */
+	struct vma_to_fileoffset_map *next;	/* list pointer */
+	unsigned int vma;	/* SPU virtual memory address from elf */
+	unsigned int size;	/* size of section from elf */
+	unsigned int offset;	/* offset of section into elf file */
+	unsigned int guard_ptr;	/* local-store offset of the guard; 0 = none */
+	unsigned int guard_val;	/* value the guard must hold for a match */
+	/*
+	 * The guard pointer is an entry in the _ovly_buf_table,
+	 * computed using ovly.buf as the index into the table.  Since
+	 * ovly.buf values begin at '1' to reference the first (or 0th)
+	 * entry in the _ovly_buf_table, the computation subtracts 1
+	 * from ovly.buf.
+	 * The guard value is stored in the _ovly_buf_table entry and
+	 * is an index (starting at 1) back to the _ovly_table entry
+	 * that is pointing at this _ovly_buf_table entry.  So, for
+	 * example, for an overlay scenario with one overlay segment
+	 * and two overlay sections:
+	 *      - Section 1 points to the first entry of the
+	 *        _ovly_buf_table, which contains a guard value
+	 *        of '1', referencing the first (index=0) entry of
+	 *        _ovly_table.
+	 *      - Section 2 points to the second entry of the
+	 *        _ovly_buf_table, which contains a guard value
+	 *        of '2', referencing the second (index=1) entry of
+	 *        _ovly_table.
+	 */
+
+};
+
+/* The three functions below build (create_vma_map), query (vma_map_lookup)
+ * and release (vma_map_free) the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map *create_vma_map(const struct spu *spu,
+					     u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map,
+			    unsigned int vma, const struct spu *aSpu,
+			    int *grd_val);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+int start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);	/* cancels the timer, frees the samples */
+
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int *samples,
+		     int num_samples);
+
+/* Compute the sampling timer period; freq_khz == 0 selects ppc_proc_freq. */
+void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif	  /* PR_UTIL_H */
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
new file mode 100644
index 000000000000..380d7e217531
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -0,0 +1,221 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *	    Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024	/* sample slots per SPU in samples[] */
+#define SCALE_SHIFT 14		/* fixed-point shift for the interval math */
+
+static u32 *samples;	/* SPUS_PER_NODE rows of TRACE_ARRAY_SIZE SPU PCs */
+
+static int spu_prof_running;	/* cleared to make the timer callback stop */
+static unsigned int profiling_interval;	/* timer period, in nanoseconds */
+
+#define NUM_SPU_BITS_TRBUF 16	/* bits per SPU PC in a trace entry */
+#define SPUS_PER_TB_ENTRY   4	/* SPU PCs per 64-bit trace word */
+#define SPUS_PER_NODE	     8
+
+#define SPU_PC_MASK	     0xFFFF
+
+static DEFINE_SPINLOCK(sample_array_lock);	/* serializes use of samples[] */
+unsigned long sample_array_lock_flags;	/* irq flags saved with that lock */
+
+/* Set profiling_interval (ns) from the CPU clock and cycles_reset. */
+void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long ns_per_cyc;
+
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+	/* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * (USEC_PER_SEC / freq_khz below is that same ratio.)  To avoid
+	 * floating point math, we use the scale math technique as
+	 * described in linux/jiffies.h.  We use a scale factor of
+	 * SCALE_SHIFT, which provides 4 decimal places of precision.
+	 * This is close enough for the purpose at hand.
+	 *
+	 * The value of the timeout should be small enough that the hw
+	 * trace buffer will not get more than about 1/3 full for the
+	 * maximum user specified (the LFSR value) hw sampling frequency.
+	 * This is to ensure the trace buffer will never fill even if the
+	 * kernel thread scheduling varies under a heavy system load.
+	 */
+	ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT;
+}
+
+/*
+ * Read one trace buffer entry and store its SPU PCs in samples[] at 'entry'
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+	/* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_mask;
+	int spu;
+
+	spu_mask = SPU_PC_MASK;
+
+	/* Each SPU PC is 16 bits; hence, four spus in each of
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.	Process two 64-bit values
+	 * simultaneously.
+	 * trace_buffer[0] SPU PC contents are: 0 1 2 3
+	 * trace_buffer[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter, hence the shift by 2
+		 */
+		samples[spu * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[0]) << 2;
+		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[1]) << 2;
+		/* bring the next lower-numbered SPU's PC into the low bits */
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+	}
+}
+
+/*
+ * Drain the hardware trace buffer read through @cpu into the samples
+ * array, one column per trace entry.  Returns the number of entries
+ * consumed, which is at most TRACE_ARRAY_SIZE.
+ */
+static int cell_spu_pc_collection(int cpu)
+{
+	int count = 0;
+	u32 pm_trace;
+
+	for (;;) {
+		/* Re-read the trace address register before every entry. */
+		pm_trace = cbe_read_pm(cpu, trace_address);
+		if (pm_trace & CBE_PM_TRACE_BUF_EMPTY)
+			break;		/* nothing left to process */
+
+		spu_pc_extract(cpu, count);
+		count++;
+
+		/* Stop once every column of the samples array is used. */
+		if (count >= TRACE_ARRAY_SIZE)
+			break;
+	}
+	return count;
+}
+
+
+/* hrtimer callback: log each node's SPU PC samples, then re-arm the timer. */
+static enum hrtimer_restart profile_spus(struct hrtimer *timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))	/* only hw thread id 0 collects */
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		/* There should only be one kernel thread at a time processing
+		 * the samples.  In the very unlikely case that the processing
+		 * takes so long that multiple kernel threads are started to
+		 * process the samples, make sure only one of them works on
+		 * the samples array at a time.  The sample array must be
+		 * loaded and then processed for a given cpu; it is not per
+		 * cpu.
+		 */
+		spin_lock_irqsave(&sample_array_lock,
+				  sample_array_lock_flags);
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0) {
+			spin_unlock_irqrestore(&sample_array_lock,
+					       sample_array_lock_flags);
+			continue;
+		}
+		/* Hand each SPU's row of samples to spu_sync_buffer(). */
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num,
+					samples + (k * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+
+		spin_unlock_irqrestore(&sample_array_lock,
+				       sample_array_lock_flags);
+	}
+	smp_wmb();	/* ensure spu event buffer updates are written */
+			/* don't want events intermingled... */
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)	/* stop was requested while we were sampling */
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.  It is not used here: the
+ * timer period is profiling_interval, which is derived from it by
+ * set_spu_profiling_frequency().  Returns 0, or -ENOMEM.
+ */
+int start_spu_profiling(unsigned int cycles_reset)
+{
+	ktime_t kt;
+
+	pr_debug("timer resolution: %lu\n", TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+	/* Allocate arrays for collecting SPU PC samples */
+	samples = kzalloc(SPUS_PER_NODE *
+			  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+	if (!samples)
+		return -ENOMEM;
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+	return 0;
+}
+
+void stop_spu_profiling(void)
+{
+	spu_prof_running = 0;	/* profile_spus() now returns NORESTART */
+	hrtimer_cancel(&timer);
+	kfree(samples);		/* NOTE(review): pointer is left dangling */
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
new file mode 100644
index 000000000000..133665754a75
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -0,0 +1,484 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer.
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recording actual SPU program counter samples to the event buffer.
+ */
+#include <linux/dcookies.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/numa.h>
+#include <linux/oprofile.h>
+#include <linux/spinlock.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999	/* release_cached_info(): drop every entry */
+
+static DEFINE_SPINLOCK(buffer_lock);	/* taken around add_event_entry() runs */
+static DEFINE_SPINLOCK(cache_lock);	/* taken around spu_info[] updates */
+static int num_spu_nodes;	/* despite the name: online nodes * 8 */
+int spu_prof_num_nodes;		/* result of number_of_online_nodes() */
+int last_guard_val[MAX_NUMNODES * 8];	/* last overlay guard seen, per SPU */
+
+/* Container for caching information about an active SPU task. */
+struct cached_info {
+	struct vma_to_fileoffset_map *map;	/* from create_vma_map() */
+	struct spu *the_spu;	/* needed to access pointer to local_store */
+	struct kref cache_ref;	/* released through destroy_cached_info() */
+};
+
+static struct cached_info *spu_info[MAX_NUMNODES * 8];	/* by spu->number */
+
+static void destroy_cached_info(struct kref *kref)
+{
+	struct cached_info *stale =	/* the object embedding this kref */
+		container_of(kref, struct cached_info, cache_ref);
+
+	vma_map_free(stale->map);
+	kfree(stale);
+	module_put(THIS_MODULE);	/* undoes the try_module_get() at setup */
+}
+
+/* Return the cached_info for SPU spu_num, or NULL; an empty slot is first
+ * refilled from the_spu's context (if given).  ATTENTION:  Callers must
+ * obtain the cache_lock if needed prior to invoking this function.
+ */
+static struct cached_info *get_cached_info(struct spu *the_spu, int spu_num)
+{
+	struct kref *ref;
+	struct cached_info *ret_info;
+
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num);
+		ret_info = NULL;
+		goto out;
+	}
+	if (!spu_info[spu_num] && the_spu) {
+		ref = spu_get_profile_private_kref(the_spu->ctx);
+		if (ref) {
+			spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref);
+			kref_get(&spu_info[spu_num]->cache_ref);
+		}
+	}
+	/* Still NULL here when nothing is cached for this SPU. */
+	ret_info = spu_info[spu_num];
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, a negative errno (-ENOMEM).
+ */
+static int
+prepare_cached_spu_info(struct spu *spu, unsigned long objectId)
+{
+	unsigned long flags;
+	struct vma_to_fileoffset_map *new_map;
+	int retval = 0;
+	struct cached_info *info;
+
+	/* We won't bother getting cache_lock here since we
+	 * don't do anything with the cached_info that's returned.
+	 */
+	info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: cached_info allocation failed\n",
+		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+
+	/* We increment the module refcount here since SPUFS is
+	 * responsible for the final destruction of the cached_info,
+	 * and it must be able to access the destroy_cached_info()
+	 * function defined in the OProfile module.  We decrement
+	 * the module refcount in destroy_cached_info.
+	 */
+	try_module_get(THIS_MODULE);	/* NOTE(review): result is ignored */
+	spu_set_profile_private_kref(spu->ctx, &info->cache_ref,
+				destroy_cached_info);
+	spin_unlock_irqrestore(&cache_lock, flags);
+	goto out;
+
+err_alloc:
+	kfree(info);	/* NULL, or allocated but never published */
+out:
+	return retval;
+}
+
+/* Drop the spu_info[] reference for one SPU, or for all when spu_index is
+ * RELEASE_ALL.  Always returns 0.  NOTE:  The caller is responsible for
+ * locking the cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+		if (spu_index >= num_spu_nodes) {
+			printk(KERN_ERR "SPU_PROF: "
+				"%s, line %d: "
+				"Invalid index %d into spu info cache\n",
+				__FUNCTION__, __LINE__, spu_index);
+			goto out;
+		}
+		end = spu_index + 1;	/* a range of exactly one slot */
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref,
+				 destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry *dentry,
+					     struct vfsmount *vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)	/* flagged: the dentry address is returned */
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);	/* NOTE(review): result unchecked */
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name", and return it (0 if
+ * there is none).  Also determine the offset of the SPU ELF object
+ * within the mapping that contains spu_ref.  A non-zero offset implies
+ * an embedded SPU object; otherwise it is a separate SPU binary.  In
+ * both cases the dcookie of the containing file (the application, a
+ * shared lib or the SPU binary) is stored in *spu_bin_dcookie.  When
+ * no file-backed mapping contains spu_ref, *offsetp and
+ * *spu_bin_dcookie are left untouched.
+ */
+static unsigned long
+get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
+			    unsigned long *spu_bin_dcookie,
+			    unsigned long spu_ref)
+{
+	unsigned long app_cookie = 0;
+	unsigned int my_offset = 0;
+	struct file *app = NULL;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	down_read(&mm->mmap_sem);
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		app_cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		app = vma->vm_file;
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		if (!vma->vm_file)
+			goto fail_no_image_cookie;
+
+		pr_debug("Found spu ELF at %X(object-id:%lx) for file %s\n",
+			 my_offset, spu_ref,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		break;
+	}
+	if (!vma)	/* list exhausted: no mapping contains spu_ref */
+		goto fail_no_image_cookie;
+	*spu_bin_dcookie = fast_get_dcookie(vma->vm_file->f_dentry,
+						 vma->vm_file->f_vfsmnt);
+	pr_debug("got dcookie for %s\n", vma->vm_file->f_dentry->d_name.name);
+	up_read(&mm->mmap_sem);
+
+out:
+	return app_cookie;
+
+fail_no_image_cookie:
+	up_read(&mm->mmap_sem);
+
+	printk(KERN_ERR "SPU_PROF: "
+		"%s, line %d: Cannot find dcookie for SPU binary\n",
+		__FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* Find or create the cached context information for the passed SPU and
+ * record an SPU context-switch record in the OProfile event buffer.
+ * Returns 0 on success, the prepare_cached_spu_info() error, or -ENOENT.
+ */
+static int process_context_switch(struct spu *spu, unsigned long objectId)
+{
+	unsigned long flags;
+	int retval;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie;
+
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval)
+		goto out;
+
+	/* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId);
+	if (!app_dcookie || !spu_cookie) {	/* either cookie is missing */
+		retval  = -ENOENT;
+		goto out;
+	}
+
+	/* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+	add_event_entry(spu_cookie);
+	add_event_entry(offset);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();	/* ensure spu event buffer updates are written */
+			/* don't want entries intermingled... */
+out:
+	return retval;
+}
+
+/*
+ * This function is invoked on either a bind_context or unbind_context.
+ * If called for an unbind_context, the val arg is 0; otherwise,
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block *self, unsigned long val,
+				void *data)
+{
+	int retval;
+	unsigned long flags;
+	struct spu *the_spu = data;
+
+	pr_debug("SPU event notification arrived\n");
+	if (!val) {	/* unbind: drop this SPU's cached info */
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {	/* bind: val is the object-id */
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {	/* used by spu_sync_start/stop */
+	.notifier_call = spu_active_notify,
+};
+
+static int number_of_online_nodes(void)
+{
+	u32 cpu, node_plus_one;
+	int total = 0;
+
+	for_each_online_cpu(cpu) {
+		node_plus_one = cbe_cpu_to_node(cpu) + 1;
+		total += (node_plus_one > total) ? 1 : 0; /* grows by one at most */
+	}
+	return total;
+}
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.	 A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void)
+{
+	int k;
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;	/* a count of SPUs, not nodes */
+	/* Announce SPU profiling and the SPU count in the event buffer. */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+	/* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+	/* Start with no overlay guard value recorded for any SPU. */
+	for (k = 0; k < (MAX_NUMNODES * 8); k++)
+		last_guard_val[k] = 0;
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int *samples,
+		     int num_samples)
+{
+	unsigned long long file_offset;
+	unsigned long flags;
+	int i;
+	struct vma_to_fileoffset_map *map;
+	struct spu *the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info *c_info;
+
+	/* We need to obtain the cache_lock here because it's
+	 * possible that after getting the cached_info, the SPU job
+	 * corresponding to this cached_info may end, thus resulting
+	 * in the destruction of the cached_info.
+	 */
+	spin_lock_irqsave(&cache_lock, flags);
+	c_info = get_cached_info(NULL, spu_num);
+	if (!c_info) {
+		/* This legitimately happens when the SPU task ends before all
+		 * samples are recorded.
+		 * No big deal -- so we just drop a few samples.
+		 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		goto out;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock(&buffer_lock);
+	for (i = 0; i < num_samples; i++) {
+		unsigned int sample = *(samples+i);
+		int grd_val = 0;
+		file_offset = 0;
+		if (sample == 0)	/* zero samples are not recorded */
+			continue;
+		file_offset = vma_map_lookup( map, sample, the_spu, &grd_val);
+
+		/* If overlays are used by this SPU application, the guard
+		 * value is non-zero, indicating which overlay section is in
+		 * use.	 We need to discard samples taken during the time
+		 * period in which an overlay occurs (i.e., guard value changes).
+		 */
+		if (grd_val && grd_val != last_guard_val[spu_num]) {
+			last_guard_val[spu_num] = grd_val;
+			/* Drop the rest of the samples. */
+			break;
+		}
+		/* Entry layout: SPU number in bits 32 and up, offset below. */
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock(&buffer_lock);
+out:
+	spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {	/* still registered: leave the cache alone */
+		printk(KERN_ERR "SPU_PROF: "
+			"%s, line %d: spu_switch_event_unregister returned %d\n",
+			__FUNCTION__, __LINE__, ret);
+		goto out;
+	}
+	/* Unregistered: drop every cached_info reference held in spu_info[]. */
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
diff --git a/arch/powerpc/oprofile/cell/vma_map.c b/arch/powerpc/oprofile/cell/vma_map.c
new file mode 100644
index 000000000000..76ec1d16aef7
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/vma_map.c
@@ -0,0 +1,287 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map != NULL) {
+		struct vma_to_fileoffset_map *head = map;
+		map = head->next;	/* advance before the node is freed */
+		kfree(head);
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu *aSpu, int *grd_val)
+{
+	struct vma_to_fileoffset_map *seg;
+	u32 guard;
+
+	for (seg = map; seg; seg = seg->next) {
+		/* Skip sections that do not cover this address. */
+		if (!(vma >= seg->vma && vma < seg->vma + seg->size))
+			continue;
+
+		/* An overlay section only counts while its guard word in
+		 * local store holds the value recorded for the section.
+		 */
+		if (seg->guard_ptr) {
+			guard = *(u32 *)(aSpu->local_store + seg->guard_ptr);
+			if (guard != seg->guard_val)
+				continue;
+			*grd_val = guard;
+		}
+		return vma - seg->vma + seg->offset;
+	}
+
+	/* Not in the map (e.g. dynamically generated code): hand the
+	 * address itself, flagged with 0x10000000, to the user space
+	 * tools so it can be reported rather than thrown away.
+	 */
+	return 0x10000000 + vma;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map *new =
+		kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {	/* on failure the whole existing list is freed too */
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+	/* Push the new entry onto the front of the list. */
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;	/* the new head of the list */
+}
+
+
+/*
+ * Parse the SPE ELF image at user address spu_elf_start and generate a list
+ * of vma_maps: one per PT_LOAD segment without p_flags bit 27 set, plus one
+ * per _ovly_table entry.  A pointer to the first vma_map in the list is
+ * returned; on failure the partial list is freed and NULL is returned.
+ */
+struct vma_to_fileoffset_map *create_vma_map(const struct spu *aSpu,
+					     unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+	int grd_val;
+	struct vma_to_fileoffset_map *map = NULL;
+	struct spu_overlay_info ovly;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+	unsigned int ovly_table_sym = 0, ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0, ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	/* Get and validate ELF header.  */
+	if (copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr)))
+		goto fail;
+
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_ident parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_machine != EM_SPU) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_machine parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_type parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		if (copy_from_user(&phdr,
+				   (void *) (phdr_start + i * sizeof(phdr)),
+				   sizeof(phdr)))
+			goto fail;
+
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz,
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			goto fail;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		if (copy_from_user(&shdr,
+				   (void *) (shdr_start + i * sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		if (copy_from_user(&shdr_str,
+				   (void *) (shdr_start + shdr.sh_link *
+					     sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr_str.sh_type != SHT_STRTAB)
+			goto fail;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			if (copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j *
+							   sizeof (sym)),
+					   sizeof (sym)))
+				goto fail;
+
+			if (copy_from_user(name, (void *)
+					   (spu_elf_start + shdr_str.sh_offset +
+					    sym.st_name),
+					   20))
+				goto fail;
+
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		goto out;
+	}
+	pr_debug("SPU_PROF: Overlay table found\n");
+
+	/* The _ovly_table symbol represents a table with one entry
+	 * per overlay section.	 The _ovly_buf_table symbol represents
+	 * a table with one entry per overlay region.
+	 * The struct spu_overlay_info gives the structure of the _ovly_table
+	 * entries.  The structure of _ovly_table_buf is simply one
+	 * u32 word per entry.
+	 */
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym,
+					    aSpu, &grd_val);
+	/* NOTE(review): overlay_tbl_offset is unsigned, so this check is
+	 * always false; confirm how vma_map_lookup() reports failure. */
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+
+	n_ovlys = (ovly_table_end_sym - ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		if (copy_from_user(&ovly, (void *)
+				   (ovly_table + i * sizeof (ovly)),
+				   sizeof (ovly)))
+			goto fail;
+
+		/* The ovly.vma/size/offset arguments are analogous to the same
+		 * arguments used above for non-overlay maps.  The final two
+		 * args are referred to as the guard pointer and the guard
+		 * value.
+		 * The guard pointer is an entry in the _ovly_buf_table,
+		 * computed using ovly.buf as the index into the table.	 Since
+		 * ovly.buf values begin at '1' to reference the first (or 0th)
+		 * entry in the _ovly_buf_table, the computation subtracts 1
+		 * from ovly.buf.
+		 * The guard value is stored in the _ovly_buf_table entry and
+		 * is an index (starting at 1) back to the _ovly_table entry
+		 * that is pointing at this _ovly_buf_table entry.  So, for
+		 * example, for an overlay scenario with one overlay segment
+		 * and two overlay sections:
+		 *	- Section 1 points to the first entry of the
+		 *	  _ovly_buf_table, which contains a guard value
+		 *	  of '1', referencing the first (index=0) entry of
+		 *	  _ovly_table.
+		 *	- Section 2 points to the second entry of the
+		 *	  _ovly_buf_table, which contains a guard value
+		 *	  of '2', referencing the second (index=1) entry of
+		 *	  _ovly_table.
+		 */
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				  ovly_buf_table_sym + (ovly.buf-1) * 4, i+1);
+		if (!map)
+			goto fail;
+	}
+	goto out;
+
+ fail:
+	/* Don't leak entries added so far (map may already be NULL here). */
+	vma_map_free(map);
+	map = NULL;
+ out:
+	return map;
+}
diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c
index 1a7ef7e246d2..a28cce1d6c24 100644
--- a/arch/powerpc/oprofile/common.c
+++ b/arch/powerpc/oprofile/common.c
@@ -29,6 +29,8 @@ static struct op_powerpc_model *model;
 static struct op_counter_config ctr[OP_MAX_COUNTER];
 static struct op_system_config sys;
 
+static int op_per_cpu_rc;
+
 static void op_handle_interrupt(struct pt_regs *regs)
 {
 	model->handle_interrupt(regs, ctr);
@@ -36,25 +38,41 @@ static void op_handle_interrupt(struct pt_regs *regs)
 
 static void op_powerpc_cpu_setup(void *dummy)
 {
-	model->cpu_setup(ctr);
+	int ret;
+
+	ret = model->cpu_setup(ctr);
+
+	if (ret != 0)
+		op_per_cpu_rc = ret;
 }
 
 static int op_powerpc_setup(void)
 {
 	int err;
 
+	op_per_cpu_rc = 0;
+
 	/* Grab the hardware */
 	err = reserve_pmc_hardware(op_handle_interrupt);
 	if (err)
 		return err;
 
 	/* Pre-compute the values to stuff in the hardware registers.  */
-	model->reg_setup(ctr, &sys, model->num_counters);
+	op_per_cpu_rc = model->reg_setup(ctr, &sys, model->num_counters);
 
-	/* Configure the registers on all cpus.  */
+	if (op_per_cpu_rc)
+		goto out;
+
+	/* Configure the registers on all cpus.	 If an error occurs on one
+	 * of the cpus, op_per_cpu_rc will be set to the error */
 	on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1);
 
-	return 0;
+out:	if (op_per_cpu_rc) {
+		/* error on setup release the performance counter hardware */
+		release_pmc_hardware();
+	}
+
+	return op_per_cpu_rc;
 }
 
 static void op_powerpc_shutdown(void)
@@ -64,16 +82,29 @@ static void op_powerpc_shutdown(void)
 
 static void op_powerpc_cpu_start(void *dummy)
 {
-	model->start(ctr);
+	/* If any of the cpus returns an error, set the
+	 * global flag to the error so it can be returned
+	 * to the generic OProfile caller.
+	 */
+	int ret;
+
+	ret = model->start(ctr);
+	if (ret != 0)
+		op_per_cpu_rc = ret;
 }
 
 static int op_powerpc_start(void)
 {
+	op_per_cpu_rc = 0;
+
 	if (model->global_start)
-		model->global_start(ctr);
-	if (model->start)
+		return model->global_start(ctr);
+	if (model->start) {
 		on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1);
-	return 0;
+		return op_per_cpu_rc;
+	}
+	return -EIO; /* No start function is defined for this
+			power architecture */
 }
 
 static inline void op_powerpc_cpu_stop(void *dummy)
@@ -147,11 +178,13 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 
 	switch (cur_cpu_spec->oprofile_type) {
 #ifdef CONFIG_PPC64
-#ifdef CONFIG_PPC_CELL_NATIVE
+#ifdef CONFIG_OPROFILE_CELL
 		case PPC_OPROFILE_CELL:
 			if (firmware_has_feature(FW_FEATURE_LPAR))
 				return -ENODEV;
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
diff --git a/arch/powerpc/oprofile/op_model_7450.c b/arch/powerpc/oprofile/op_model_7450.c
index 5d1bbaf35ccb..cc599eb8768b 100644
--- a/arch/powerpc/oprofile/op_model_7450.c
+++ b/arch/powerpc/oprofile/op_model_7450.c
@@ -81,7 +81,7 @@ static void pmc_stop_ctrs(void)
 
 /* Configures the counters on this CPU based on the global
  * settings */
-static void fsl7450_cpu_setup(struct op_counter_config *ctr)
+static int fsl7450_cpu_setup(struct op_counter_config *ctr)
 {
 	/* freeze all counters */
 	pmc_stop_ctrs();
@@ -89,12 +89,14 @@ static void fsl7450_cpu_setup(struct op_counter_config *ctr)
 	mtspr(SPRN_MMCR0, mmcr0_val);
 	mtspr(SPRN_MMCR1, mmcr1_val);
 	mtspr(SPRN_MMCR2, mmcr2_val);
+
+	return 0;
 }
 
 #define NUM_CTRS 6
 
 /* Configures the global settings for the countes on all CPUs. */
-static void fsl7450_reg_setup(struct op_counter_config *ctr,
+static int fsl7450_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -126,10 +128,12 @@ static void fsl7450_reg_setup(struct op_counter_config *ctr,
 		| mmcr1_event6(ctr[5].event);
 
 	mmcr2_val = 0;
+
+	return 0;
 }
 
 /* Sets the counters on this CPU to the chosen values, and starts them */
-static void fsl7450_start(struct op_counter_config *ctr)
+static int fsl7450_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -148,6 +152,8 @@ static void fsl7450_start(struct op_counter_config *ctr)
 	pmc_start_ctrs();
 
 	oprofile_running = 1;
+
+	return 0;
 }
 
 /* Stop the counters on this CPU */
@@ -193,7 +199,7 @@ static void fsl7450_handle_interrupt(struct pt_regs *regs,
 	/* The freeze bit was set by the interrupt. */
 	/* Clear the freeze bit, and reenable the interrupt.
 	 * The counters won't actually start until the rfi clears
-	 * the PMM bit */
+	 * the PM/M bit */
 	pmc_start_ctrs();
 }
 
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index c29293befba9..d928b54f3a0f 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -5,8 +5,8 @@
  *
  * Author: David Erb (djerb@us.ibm.com)
  * Modifications:
- *         Carl Love <carll@us.ibm.com>
- *         Maynard Johnson <maynardj@us.ibm.com>
+ *	   Carl Love <carll@us.ibm.com>
+ *	   Maynard Johnson <maynardj@us.ibm.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -38,12 +38,25 @@
 
 #include "../platforms/cell/interrupt.h"
 #include "../platforms/cell/cbe_regs.h"
+#include "cell/pr_util.h"
+
+static void cell_global_stop_spu(void);
+
+/*
+ * spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2	/*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
-#define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
-                                 * PPU_CYCLES event
-                                 */
-#define CBE_COUNT_ALL_CYCLES 0x42800000	/* PPU cycle event specifier */
+#define PPU_CYCLES_GRP_NUM   1	/* special group number for identifying
+				 * PPU_CYCLES event
+				 */
+#define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */
 
 #define NUM_THREADS 2         /* number of physical threads in
 			       * physical processor
@@ -51,6 +64,7 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
+#define MAX_SPU_COUNT 0xFFFFFF	/* maximum 24 bit LFSR value */
 
 struct pmc_cntrl_data {
 	unsigned long vcntr;
@@ -62,11 +76,10 @@ struct pmc_cntrl_data {
 /*
  * ibm,cbe-perftools rtas parameters
  */
-
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
-	u16 sub_unit;		/* hw subunit this applies to (if applicable) */
-	short int signal_group;	/* Signal Group to Enable/Disable */
+	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
+	short int signal_group; /* Signal Group to Enable/Disable */
 	u8 bus_word;		/* Enable/Disable on this Trace/Trigger/Event
 				 * Bus Word(s) (bitmask)
 				 */
@@ -112,21 +125,42 @@ static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
 
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 
-/* Interpetation of hdw_thread:
+/*
+ * The CELL profiling code makes rtas calls to setup the debug bus to
+ * route the performance signals.  Additionally, SPU profiling requires
+ * a second rtas call to setup the hardware to capture the SPU PCs.
+ * The EIO error value is returned if the token lookups or the rtas
+ * call fail.  The EIO error number is the best choice of the existing
+ * error numbers.  The probability of rtas related error is very low.  But
+ * by returning EIO and printing additional information to dmesg the user
+ * will know that OProfile did not start and dmesg will tell them why.
+ * OProfile does not support returning errors on Stop.	Not a huge issue
+ * since failure to reset the debug bus or stop the SPU PC collection is
+ * not a fatal issue.  Chances are if the Stop failed, Start doesn't work
+ * either.
+ */
+
+/*
+ * Interpretation of hdw_thread:
  * 0 - even virtual cpus 0, 2, 4,...
  * 1 - odd virtual cpus 1, 3, 5, ...
+ *
+ * FIXME: this is strictly wrong, we need to clean this up in a number
+ * of places. It works for now. -arnd
  */
 static u32 hdw_thread;
 
 static u32 virt_cntr_inter_mask;
 static struct timer_list timer_virt_cntr;
 
-/* pm_signal needs to be global since it is initialized in
+/*
+ * pm_signal needs to be global since it is initialized in
  * cell_reg_setup at the time when the necessary information
  * is available.
  */
 static struct pm_signal pm_signal[NR_PHYS_CTRS];
-static int pm_rtas_token;
+static int pm_rtas_token;    /* token for debug bus setup call */
+static int spu_rtas_token;   /* token for SPU cycle profiling */
 
 static u32 reset_value[NR_PHYS_CTRS];
 static int num_counters;
@@ -147,8 +181,8 @@ rtas_ibm_cbe_perftools(int subfunc, int passthru,
 {
 	u64 paddr = __pa(address);
 
-	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
-			 paddr >> 32, paddr & 0xffffffff, length);
+	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc,
+			 passthru, paddr >> 32, paddr & 0xffffffff, length);
 }
 
 static void pm_rtas_reset_signals(u32 node)
@@ -156,12 +190,13 @@ static void pm_rtas_reset_signals(u32 node)
 	int ret;
 	struct pm_signal pm_signal_local;
 
-	/*  The debug bus is being set to the passthru disable state.
-	 *  However, the FW still expects atleast one legal signal routing
-	 *  entry or it will return an error on the arguments.  If we don't
-	 *  supply a valid entry, we must ignore all return values.  Ignoring
-	 *  all return values means we might miss an error we should be
-	 *  concerned about.
+	/*
+	 * The debug bus is being set to the passthru disable state.
+	 * However, the FW still expects atleast one legal signal routing
+	 * entry or it will return an error on the arguments.	If we don't
+	 * supply a valid entry, we must ignore all return values.  Ignoring
+	 * all return values means we might miss an error we should be
+	 * concerned about.
 	 */
 
 	/*  fw expects physical cpu #. */
@@ -175,18 +210,24 @@ static void pm_rtas_reset_signals(u32 node)
 				     &pm_signal_local,
 				     sizeof(struct pm_signal));
 
-	if (ret)
+	if (unlikely(ret))
+		/*
+		 * Not a fatal error. For Oprofile stop, the oprofile
+		 * functions do not support returning an error for
+		 * failure to stop OProfile.
+		 */
 		printk(KERN_WARNING "%s: rtas returned: %d\n",
 		       __FUNCTION__, ret);
 }
 
-static void pm_rtas_activate_signals(u32 node, u32 count)
+static int pm_rtas_activate_signals(u32 node, u32 count)
 {
 	int ret;
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
-	/* There is no debug setup required for the cycles event.
+	/*
+	 * There is no debug setup required for the cycles event.
 	 * Note that only events in the same group can be used.
 	 * Otherwise, there will be conflicts in correctly routing
 	 * the signals on the debug bus.  It is the responsiblity
@@ -213,10 +254,14 @@ static void pm_rtas_activate_signals(u32 node, u32 count)
 					     pm_signal_local,
 					     i * sizeof(struct pm_signal));
 
-		if (ret)
+		if (unlikely(ret)) {
 			printk(KERN_WARNING "%s: rtas returned: %d\n",
 			       __FUNCTION__, ret);
+			return -EIO;
+		}
 	}
+
+	return 0;
 }
 
 /*
@@ -260,11 +305,12 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask)
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
 
-	/* Some of the islands signal selection is based on 64 bit words.
+	/*
+	 * Some of the islands signal selection is based on 64 bit words.
 	 * The debug bus words are 32 bits, the input words to the performance
 	 * counters are defined as 32 bits.  Need to convert the 64 bit island
 	 * specification to the appropriate 32 input bit and bus word for the
-	 * performance counter event selection.  See the CELL Performance
+	 * performance counter event selection.	 See the CELL Performance
 	 * monitoring signals manual and the Perf cntr hardware descriptions
 	 * for the details.
 	 */
@@ -298,6 +344,7 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask)
 					input_bus[j] = i;
 					pm_regs.group_control |=
 					    (i << (31 - i));
+
 					break;
 				}
 			}
@@ -309,7 +356,8 @@ out:
 
 static void write_pm_cntrl(int cpu)
 {
-	/* Oprofile will use 32 bit counters, set bits 7:10 to 0
+	/*
+	 * Oprofile will use 32 bit counters, set bits 7:10 to 0
 	 * pmregs.pm_cntrl is a global
 	 */
 
@@ -326,7 +374,8 @@ static void write_pm_cntrl(int cpu)
 	if (pm_regs.pm_cntrl.freeze == 1)
 		val |= CBE_PM_FREEZE_ALL_CTRS;
 
-	/* Routine set_count_mode must be called previously to set
+	/*
+	 * Routine set_count_mode must be called previously to set
 	 * the count mode based on the user selection of user and kernel.
 	 */
 	val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
@@ -336,7 +385,8 @@ static void write_pm_cntrl(int cpu)
 static inline void
 set_count_mode(u32 kernel, u32 user)
 {
-	/* The user must specify user and kernel if they want them. If
+	/*
+	 * The user must specify user and kernel if they want them. If
 	 *  neither is specified, OProfile will count in hypervisor mode.
 	 *  pm_regs.pm_cntrl is a global
 	 */
@@ -364,7 +414,7 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl)
 
 /*
  * Oprofile is expected to collect data on all CPUs simultaneously.
- * However, there is one set of performance counters per node.  There are
+ * However, there is one set of performance counters per node.	There are
  * two hardware threads or virtual CPUs on each node.  Hence, OProfile must
  * multiplex in time the performance counter collection on the two virtual
  * CPUs.  The multiplexing of the performance counters is done by this
@@ -377,19 +427,19 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl)
  * pair of per-cpu arrays is used for storing the previous and next
  * pmc values for a given node.
  * NOTE: We use the per-cpu variable to improve cache performance.
+ *
+ * This routine will alternate loading the virtual counters for
+ * virtual CPUs
  */
 static void cell_virtual_cntr(unsigned long data)
 {
-	/* This routine will alternate loading the virtual counters for
-	 * virtual CPUs
-	 */
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
 	unsigned long flags;
 
-	/* Make sure that the interrupt_hander and
-	 * the virt counter are not both playing with
-	 * the counters on the same node.
+	/*
+	 * Make sure that the interrupt_hander and the virt counter are
+	 * not both playing with the counters on the same node.
 	 */
 
 	spin_lock_irqsave(&virt_cntr_lock, flags);
@@ -400,22 +450,25 @@ static void cell_virtual_cntr(unsigned long data)
 	hdw_thread = 1 ^ hdw_thread;
 	next_hdw_thread = hdw_thread;
 
-	for (i = 0; i < num_counters; i++)
-	/* There are some per thread events.  Must do the
+	/*
+	 * There are some per thread events.  Must do the
 	 * set event, for the thread that is being started
 	 */
+	for (i = 0; i < num_counters; i++)
 		set_pm_event(i,
 			pmc_cntrl[next_hdw_thread][i].evnts,
 			pmc_cntrl[next_hdw_thread][i].masks);
 
-	/* The following is done only once per each node, but
+	/*
+	 * The following is done only once per each node, but
 	 * we need cpu #, not node #, to pass to the cbe_xxx functions.
 	 */
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
 
-		/* stop counters, save counter values, restore counts
+		/*
+		 * stop counters, save counter values, restore counts
 		 * for previous thread
 		 */
 		cbe_disable_pm(cpu);
@@ -428,7 +481,7 @@ static void cell_virtual_cntr(unsigned long data)
 			    == 0xFFFFFFFF)
 				/* If the cntr value is 0xffffffff, we must
 				 * reset that to 0xfffffff0 when the current
-				 * thread is restarted.  This will generate a
+				 * thread is restarted.	 This will generate a
 				 * new interrupt and make sure that we never
 				 * restore the counters to the max value.  If
 				 * the counters were restored to the max value,
@@ -444,13 +497,15 @@ static void cell_virtual_cntr(unsigned long data)
 						      next_hdw_thread)[i]);
 		}
 
-		/* Switch to the other thread. Change the interrupt
+		/*
+		 * Switch to the other thread. Change the interrupt
 		 * and control regs to be scheduled on the CPU
 		 * corresponding to the thread to execute.
 		 */
 		for (i = 0; i < num_counters; i++) {
 			if (pmc_cntrl[next_hdw_thread][i].enabled) {
-				/* There are some per thread events.
+				/*
+				 * There are some per thread events.
 				 * Must do the set event, enable_cntr
 				 * for each cpu.
 				 */
@@ -482,17 +537,42 @@ static void start_virt_cntrs(void)
 }
 
 /* This function is called once for all cpus combined */
-static void
-cell_reg_setup(struct op_counter_config *ctr,
-	       struct op_system_config *sys, int num_ctrs)
+static int cell_reg_setup(struct op_counter_config *ctr,
+			struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
+
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+
+		/*
+		 * Each node will need to make the rtas call to start
+		 * and stop SPU profiling.  Get the token once and store it.
+		 */
+		spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
+
+		if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+			return -EIO;
+		}
+	}
 
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
+
+	/*
+	 * For all events except PPU CYCLEs, each node will need to make
+	 * the rtas cbe-perftools call to setup and reset the debug bus.
+	 * Make the token lookup call once and store it in the global
+	 * variable pm_rtas_token.
+	 */
+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+		printk(KERN_ERR
+		       "%s: rtas token ibm,cbe-perftools unknown\n",
 		       __FUNCTION__);
-		goto out;
+		return -EIO;
 	}
 
 	num_counters = num_ctrs;
@@ -520,7 +600,8 @@ cell_reg_setup(struct op_counter_config *ctr,
 			per_cpu(pmc_values, j)[i] = 0;
 	}
 
-	/* Setup the thread 1 events, map the thread 0 event to the
+	/*
+	 * Setup the thread 1 events, map the thread 0 event to the
 	 * equivalent thread 1 event.
 	 */
 	for (i = 0; i < num_ctrs; ++i) {
@@ -544,9 +625,10 @@ cell_reg_setup(struct op_counter_config *ctr,
 	for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
 		input_bus[i] = 0xff;
 
-	/* Our counters count up, and "count" refers to
+	/*
+	 * Our counters count up, and "count" refers to
 	 * how much before the next interrupt, and we interrupt
-	 * on overflow.  So we calculate the starting value
+	 * on overflow.	 So we calculate the starting value
 	 * which will give us "count" until overflow.
 	 * Then we set the events on the enabled counters.
 	 */
@@ -569,28 +651,27 @@ cell_reg_setup(struct op_counter_config *ctr,
 		for (i = 0; i < num_counters; ++i) {
 			per_cpu(pmc_values, cpu)[i] = reset_value[i];
 		}
-out:
-	;
+
+	return 0;
 }
 
+
+
 /* This function is called once for each cpu */
-static void cell_cpu_setup(struct op_counter_config *cntr)
+static int cell_cpu_setup(struct op_counter_config *cntr)
 {
 	u32 cpu = smp_processor_id();
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return 0;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
 	if (cbe_get_hw_thread_id(cpu))
-		goto out;
-
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
-		       __FUNCTION__);
-		goto out;
-	}
+		return 0;
 
 	/* Stop all counters */
 	cbe_disable_pm(cpu);
@@ -609,16 +690,286 @@ static void cell_cpu_setup(struct op_counter_config *cntr)
 		}
 	}
 
-	pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+	/*
+	 * The pm_rtas_activate_signals will return -EIO if the FW
+	 * call failed.
+	 */
+	return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+}
+
+#define ENTRIES	 303
+#define MAXLFSR	 0xFFFFFF
+
+/* precomputed table of 24 bit LFSR values */
+static int initial_lfsr[] = {
+ 8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
+ 15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
+ 4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
+ 3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
+ 9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026,
+ 2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556,
+ 3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769,
+ 14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893,
+ 11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017,
+ 6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756,
+ 15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558,
+ 7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401,
+ 16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720,
+ 15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042,
+ 15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955,
+ 10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934,
+ 3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783,
+ 3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278,
+ 8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051,
+ 8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741,
+ 4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972,
+ 16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302,
+ 2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384,
+ 14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469,
+ 1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697,
+ 6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398,
+ 10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140,
+ 10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214,
+ 14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386,
+ 7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087,
+ 9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130,
+ 14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300,
+ 13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475,
+ 5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950,
+ 3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
+ 6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
+ 7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
+ 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
+};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.	 An LFSR sequence is like a pseudo random number sequence
+ * where each number occurs once in the sequence but the sequence is not in
+ * numerical order. The SPU PC capture is done when the LFSR sequence reaches
+ * the last value in the sequence.  Hence the user specified value N
+ * corresponds to the LFSR number that is N from the end of the sequence.
+ *
+ * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
+ * LFSR sequence is broken into four ranges.  The spacing of the precomputed
+ * values is adjusted in each range so the error between the user specified
+ * number (N) of events between samples and the actual number of events based
+ * on the precomputed value will be less than about 6.2%.  Note, if the user
+ * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
+ * This is to prevent the loss of samples because the trace buffer is full.
+ *
+ *	   User specified N		     Step between	   Index in
+ *					 precomputed values	 precomputed
+ *								    table
+ * 0		    to	2^16-1			----		      0
+ * 2^16	    to	2^16+2^19-1		2^12		    1 to 128
+ * 2^16+2^19	    to	2^16+2^19+2^22-1	2^15		  129 to 256
+ * 2^16+2^19+2^22  to	2^24-1			2^18		  257 to 302
+ *
+ *
+ * For example, the LFSR values in the second range are computed for 2^16,
+ * 2^16+2^12, ... , 2^16+2^19-2^12 and stored in the table at indices
+ * 1, 2,..., 127, 128.
+ *
+ * The 24 bit LFSR value for the nth number in the sequence can be
+ * calculated using the following code:
+ *
+ * #define size 24
+ * int calculate_lfsr(int n)
+ * {
+ *	int i;
+ *	unsigned int newlfsr0;
+ *	unsigned int lfsr = 0xFFFFFF;
+ *	unsigned int howmany = n;
+ *
+ *	for (i = 2; i < howmany + 2; i++) {
+ *		newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+ *		((lfsr >> (size - 1 - 1)) & 1) ^
+ *		(((lfsr >> (size - 1 - 6)) & 1) ^
+ *		((lfsr >> (size - 1 - 23)) & 1)));
+ *
+ *		lfsr >>= 1;
+ *		lfsr = lfsr | (newlfsr0 << (size - 1));
+ *	}
+ *	return lfsr;
+ * }
+ */
+
+#define V2_16  (0x1 << 16)
+#define V2_19  (0x1 << 19)
+#define V2_22  (0x1 << 22)
+
+static int calculate_lfsr(int n)
+{
+	/*
+	 * Map interval n to a table slot.  Ranges and steps are powers of 2
+	 * so the calculations can be done using shifts rather than divide.
+	 */
+	int index;
+
+	if ((n >> 16) == 0)
+		index = 0;
+	else if (((n - V2_16) >> 19) == 0)
+		index = ((n - V2_16) >> 12) + 1;
+	else if (((n - V2_16 - V2_19) >> 22) == 0)
+		index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
+	else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
+		index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
+	else
+		index = ENTRIES-1;
+
+	/* valid indices are 0 .. ENTRIES-1; clamp anything else */
+	if ((index >= ENTRIES) || (index < 0))
+		index = ENTRIES-1;
+
+	return initial_lfsr[index];
+}
+
+/* Route the PC signals of all SPUs on @node onto the debug bus via rtas.
+ * Returns 0 on success, -EIO if the firmware call fails. */
+static int pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NUM_SPUS_PER_NODE];
+
+	/*
+	 * Set up the rtas call to configure the debug bus to route the SPU
+	 * PCs: one pm_signal per SPU, so the array is sized by SPU count.
+	 */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		/* spu i on word (i/2) */
+		pm_signal_local[i].bus_word = 1 << i / 2;
+		/* spu i */
+		pm_signal_local[i].sub_unit = i;
+		pm_signal_local[i].bit = 63;
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE,
+				     PASSTHRU_ENABLE, pm_signal_local,
+				     (NUM_SPUS_PER_NODE
+				      * sizeof(struct pm_signal)));
+	if (unlikely(ret)) {
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+		return -EIO;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	struct cpufreq_freqs *freqs = data;
+	/* Update on resume/suspend, before a speed-up, or after a slow-down. */
+	if (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE ||
+	    (val == CPUFREQ_PRECHANGE && freqs->old < freqs->new) ||
+	    (val == CPUFREQ_POSTCHANGE && freqs->old > freqs->new))
+		set_spu_profiling_frequency(freqs->new, spu_cycle_reset);
+	return 0;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static int cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret;
+	int rtas_error;
+	unsigned int cpu_khzfreq = 0;
+	/*
+	 * Start SPU cycle profiling on each node.  Returns 0 on success or
+	 * the error code of the step that failed.  The SPU profiling is
+	 * time-based on the cpu frequency, so if configured with CPU_FREQ
+	 * we detect frequency changes and react accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0)
+		/* this is not a fatal error */
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_spu_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		/*
+		 * Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		if (spu_cycle_reset > MAX_SPU_COUNT)
+			/* use largest possible value */
+			lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
+		else
+			lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		/* must use a non zero value. Zero disables data collection. */
+		if (lfsr_value == 0)
+			lfsr_value = calculate_lfsr(1);
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct
+						* register location
+						*/
+
+		/* debug bus setup */
+		ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (unlikely(ret)) {
+			rtas_error = ret;
+			goto out;
+		}
+		/* NOTE(review): the "goto out" paths skip the notifier
+		 * unregister done in cell_global_stop_spu() -- intended? */
+		subfunc = 2;	/* 2 - activate SPU tracing, 3 - deactivate */
+
+		/* start profiling */
+		ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
+		  cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (unlikely(ret != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, ret);
+			rtas_error = -EIO;
+			goto out;
+		}
+	}
+
+	rtas_error = start_spu_profiling(spu_cycle_reset);
+	if (rtas_error)
+		goto out_stop;
+
+	oprofile_running = 1;
+	return 0;
+
+out_stop:
+	cell_global_stop_spu();		/* clean up the PMU/debug bus */
 out:
-	;
+	return rtas_error;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+static int cell_global_start_ppu(struct op_counter_config *ctr)
 {
-	u32 cpu;
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -651,19 +1002,79 @@ static void cell_global_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 	smp_wmb();
 
-	/* NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
-	 * executed which manipulates the PMU.  We start the "virtual counter"
+	/*
+	 * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
+	 * executed which manipulates the PMU.	We start the "virtual counter"
 	 * here so that we do not need to synchronize access to the PMU in
 	 * the above for-loop.
 	 */
 	start_virt_cntrs();
+
+	return 0;
 }
 
-static void cell_global_stop(void)
+static int cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset)
+		return cell_global_start_spu(ctr);
+	else
+		return cell_global_start_ppu(ctr);
+}
+
+/*
+ * Note the generic OProfile stop calls do not support returning
+ * an error on stop.  Hence, will not return an error if the FW
+ * calls fail on stop.	Failure to reset the debug bus is not an issue.
+ * Failure to disable the SPU profiling is not an issue.  The FW calls
+ * to enable the performance counters and debug bus will work even if
+ * the hardware was not cleanly reset.
+ */
+static void cell_global_stop_spu(void)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		subfunc = 3;	/*
+				 * 2 - activate SPU tracing,
+				 * 3 - deactivate
+				 */
+		lfsr_value = 0x8f100000;
+
+		rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL,
+				      subfunc, cbe_cpu_to_node(cpu),
+				      lfsr_value);
+
+		if (unlikely(rtn_value != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+		}
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
-	/* This routine will be called once for the system.
+	/*
+	 * This routine will be called once for the system.
 	 * There is one performance monitor per node, so we
 	 * only need to perform this function once per node.
 	 */
@@ -687,8 +1098,16 @@ static void cell_global_stop(void)
 	}
 }
 
-static void
-cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset)
+		cell_global_stop_spu();
+	else
+		cell_global_stop_ppu();
+}
+
+static void cell_handle_interrupt(struct pt_regs *regs,
+				struct op_counter_config *ctr)
 {
 	u32 cpu;
 	u64 pc;
@@ -699,13 +1118,15 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 
 	cpu = smp_processor_id();
 
-	/* Need to make sure the interrupt handler and the virt counter
+	/*
+	 * Need to make sure the interrupt handler and the virt counter
 	 * routine are not running at the same time. See the
 	 * cell_virtual_cntr() routine for additional comments.
 	 */
 	spin_lock_irqsave(&virt_cntr_lock, flags);
 
-	/* Need to disable and reenable the performance counters
+	/*
+	 * Need to disable and reenable the performance counters
 	 * to get the desired behavior from the hardware.  This
 	 * is hardware specific.
 	 */
@@ -714,7 +1135,8 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 
 	interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
 
-	/* If the interrupt mask has been cleared, then the virt cntr
+	/*
+	 * If the interrupt mask has been cleared, then the virt cntr
 	 * has cleared the interrupt.  When the thread that generated
 	 * the interrupt is restored, the data count will be restored to
 	 * 0xffffff0 to cause the interrupt to be regenerated.
@@ -732,18 +1154,20 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 			}
 		}
 
-		/* The counters were frozen by the interrupt.
+		/*
+		 * The counters were frozen by the interrupt.
 		 * Reenable the interrupt and restart the counters.
 		 * If there was a race between the interrupt handler and
-		 * the virtual counter routine.  The virutal counter
+		 * the virtual counter routine.	 The virtual counter
 		 * routine may have cleared the interrupts.  Hence must
 		 * use the virt_cntr_inter_mask to re-enable the interrupts.
 		 */
 		cbe_enable_pm_interrupts(cpu, hdw_thread,
 					 virt_cntr_inter_mask);
 
-		/* The writes to the various performance counters only writes
-		 * to a latch.  The new values (interrupt setting bits, reset
+		/*
+		 * The writes to the various performance counters only writes
+		 * to a latch.	The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
 		 * this to work as desired, the permormance monitor needs to
@@ -755,10 +1179,33 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/*
+ * This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
diff --git a/arch/powerpc/oprofile/op_model_fsl_booke.c b/arch/powerpc/oprofile/op_model_fsl_booke.c
index 2267eb8c661b..183a28bb1812 100644
--- a/arch/powerpc/oprofile/op_model_fsl_booke.c
+++ b/arch/powerpc/oprofile/op_model_fsl_booke.c
@@ -244,7 +244,7 @@ static void dump_pmcs(void)
 			mfpmr(PMRN_PMLCA3), mfpmr(PMRN_PMLCB3));
 }
 
-static void fsl_booke_cpu_setup(struct op_counter_config *ctr)
+static int fsl_booke_cpu_setup(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -258,9 +258,11 @@ static void fsl_booke_cpu_setup(struct op_counter_config *ctr)
 
 		set_pmc_user_kernel(i, ctr[i].user, ctr[i].kernel);
 	}
+
+	return 0;
 }
 
-static void fsl_booke_reg_setup(struct op_counter_config *ctr,
+static int fsl_booke_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -276,9 +278,10 @@ static void fsl_booke_reg_setup(struct op_counter_config *ctr,
 	for (i = 0; i < num_counters; ++i)
 		reset_value[i] = 0x80000000UL - ctr[i].count;
 
+	return 0;
 }
 
-static void fsl_booke_start(struct op_counter_config *ctr)
+static int fsl_booke_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -308,6 +311,8 @@ static void fsl_booke_start(struct op_counter_config *ctr)
 
 	pr_debug("start on cpu %d, pmgc0 %x\n", smp_processor_id(),
 			mfpmr(PMRN_PMGC0));
+
+	return 0;
 }
 
 static void fsl_booke_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_pa6t.c b/arch/powerpc/oprofile/op_model_pa6t.c
index e8a56b0adadc..c40de461fd4e 100644
--- a/arch/powerpc/oprofile/op_model_pa6t.c
+++ b/arch/powerpc/oprofile/op_model_pa6t.c
@@ -89,7 +89,7 @@ static inline void ctr_write(unsigned int i, u64 val)
 
 
 /* precompute the values to stuff in the hardware registers */
-static void pa6t_reg_setup(struct op_counter_config *ctr,
+static int pa6t_reg_setup(struct op_counter_config *ctr,
 			   struct op_system_config *sys,
 			   int num_ctrs)
 {
@@ -135,10 +135,12 @@ static void pa6t_reg_setup(struct op_counter_config *ctr,
 		pr_debug("reset_value for pmc%u inited to 0x%lx\n",
 				 pmc, reset_value[pmc]);
 	}
+
+	return 0;
 }
 
 /* configure registers on this cpu */
-static void pa6t_cpu_setup(struct op_counter_config *ctr)
+static int pa6t_cpu_setup(struct op_counter_config *ctr)
 {
 	u64 mmcr0 = mmcr0_val;
 	u64 mmcr1 = mmcr1_val;
@@ -154,9 +156,11 @@ static void pa6t_cpu_setup(struct op_counter_config *ctr)
 		mfspr(SPRN_PA6T_MMCR0));
 	pr_debug("setup on cpu %d, mmcr1 %016lx\n", smp_processor_id(),
 		mfspr(SPRN_PA6T_MMCR1));
+
+	return 0;
 }
 
-static void pa6t_start(struct op_counter_config *ctr)
+static int pa6t_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -174,6 +178,8 @@ static void pa6t_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 
 	pr_debug("start on cpu %d, mmcr0 %lx\n", smp_processor_id(), mmcr0);
+
+	return 0;
 }
 
 static void pa6t_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c
index a7c206b665af..cddc250a6a5c 100644
--- a/arch/powerpc/oprofile/op_model_power4.c
+++ b/arch/powerpc/oprofile/op_model_power4.c
@@ -32,7 +32,7 @@ static u32 mmcr0_val;
 static u64 mmcr1_val;
 static u64 mmcra_val;
 
-static void power4_reg_setup(struct op_counter_config *ctr,
+static int power4_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -60,6 +60,8 @@ static void power4_reg_setup(struct op_counter_config *ctr,
 		mmcr0_val &= ~MMCR0_PROBLEM_DISABLE;
 	else
 		mmcr0_val |= MMCR0_PROBLEM_DISABLE;
+
+	return 0;
 }
 
 extern void ppc64_enable_pmcs(void);
@@ -84,7 +86,7 @@ static inline int mmcra_must_set_sample(void)
 	return 0;
 }
 
-static void power4_cpu_setup(struct op_counter_config *ctr)
+static int power4_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0 = mmcr0_val;
 	unsigned long mmcra = mmcra_val;
@@ -111,9 +113,11 @@ static void power4_cpu_setup(struct op_counter_config *ctr)
 	    mfspr(SPRN_MMCR1));
 	dbg("setup on cpu %d, mmcra %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCRA));
+
+	return 0;
 }
 
-static void power4_start(struct op_counter_config *ctr)
+static int power4_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -148,6 +152,7 @@ static void power4_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void power4_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_rs64.c b/arch/powerpc/oprofile/op_model_rs64.c
index c731acbfb2a5..a20afe45d936 100644
--- a/arch/powerpc/oprofile/op_model_rs64.c
+++ b/arch/powerpc/oprofile/op_model_rs64.c
@@ -88,7 +88,7 @@ static unsigned long reset_value[OP_MAX_COUNTER];
 
 static int num_counters;
 
-static void rs64_reg_setup(struct op_counter_config *ctr,
+static int rs64_reg_setup(struct op_counter_config *ctr,
 			   struct op_system_config *sys,
 			   int num_ctrs)
 {
@@ -100,9 +100,10 @@ static void rs64_reg_setup(struct op_counter_config *ctr,
 		reset_value[i] = 0x80000000UL - ctr[i].count;
 
 	/* XXX setup user and kernel profiling */
+	return 0;
 }
 
-static void rs64_cpu_setup(struct op_counter_config *ctr)
+static int rs64_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0;
 
@@ -125,9 +126,11 @@ static void rs64_cpu_setup(struct op_counter_config *ctr)
 	    mfspr(SPRN_MMCR0));
 	dbg("setup on cpu %d, mmcr1 %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCR1));
+
+	return 0;
 }
 
-static void rs64_start(struct op_counter_config *ctr)
+static int rs64_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -155,6 +158,7 @@ static void rs64_start(struct op_counter_config *ctr)
 	mtspr(SPRN_MMCR0, mmcr0);
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void rs64_stop(void)
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 33545d352e92..932538a93c2b 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -272,4 +272,14 @@ config CPM2
 	  you wish to build a kernel for a machine with a CPM2 coprocessor
 	  on it (826x, 827x, 8560).
 
+config AXON_RAM
+	tristate "Axon DDR2 memory device driver"
+	depends on PPC_IBM_CELL_BLADE
+	default m
+	help
+	  It registers one block device per Axon's DDR2 memory bank found
+	  on a system. Block devices are called axonram?, their major and
+	  minor numbers are available in /proc/devices, /proc/partitions or
+	  in /sys/block/axonram?/dev.
+
 endmenu
diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
index 9b2b386ccf48..ac8032034fb8 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -73,4 +73,14 @@ config CBE_CPUFREQ
 	  For details, take a look at <file:Documentation/cpu-freq/>.
 	  If you don't have such processor, say N
 
+config CBE_CPUFREQ_PMI
+	tristate "CBE frequency scaling using PMI interface"
+	depends on CBE_CPUFREQ && PPC_PMI && EXPERIMENTAL
+	default n
+	help
+	  Select this, if you want to use the PMI interface
+	  to switch frequencies. Using PMI, the
+	  processor will not only be able to run at lower speed,
+	  but also at lower core voltage.
+
 endmenu
diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index 869af89df6ff..f88a7c76f296 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -4,7 +4,9 @@ obj-$(CONFIG_PPC_CELL_NATIVE)		+= interrupt.o iommu.o setup.o \
 obj-$(CONFIG_CBE_RAS)			+= ras.o
 
 obj-$(CONFIG_CBE_THERM)			+= cbe_thermal.o
-obj-$(CONFIG_CBE_CPUFREQ)		+= cbe_cpufreq.o
+obj-$(CONFIG_CBE_CPUFREQ_PMI)		+= cbe_cpufreq_pmi.o
+obj-$(CONFIG_CBE_CPUFREQ)		+= cbe-cpufreq.o
+cbe-cpufreq-y				+= cbe_cpufreq_pervasive.o cbe_cpufreq.o
 
 ifeq ($(CONFIG_SMP),y)
 obj-$(CONFIG_PPC_CELL_NATIVE)		+= smp.o
@@ -23,3 +25,5 @@ obj-$(CONFIG_SPU_BASE)			+= spu_callbacks.o spu_base.o \
 					   $(spu-priv1-y) \
 					   $(spu-manage-y) \
 					   spufs/
+
+obj-$(CONFIG_PCI_MSI)			+= axon_msi.o
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
new file mode 100644
index 000000000000..4c9ab5b70bae
--- /dev/null
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2007, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/reboot.h>
+
+#include <asm/dcr.h>
+#include <asm/machdep.h>
+#include <asm/prom.h>
+
+
+/*
+ * MSIC registers, specified as offsets from dcr_base
+ */
+#define MSIC_CTRL_REG	0x0
+
+/* Base Address registers specify FIFO location in BE memory */
+#define MSIC_BASE_ADDR_HI_REG	0x3
+#define MSIC_BASE_ADDR_LO_REG	0x4
+
+/* Hold the read/write offsets into the FIFO */
+#define MSIC_READ_OFFSET_REG	0x5
+#define MSIC_WRITE_OFFSET_REG	0x6
+
+
+/* MSIC control register flags */
+#define MSIC_CTRL_ENABLE		0x0001
+#define MSIC_CTRL_FIFO_FULL_ENABLE	0x0002
+#define MSIC_CTRL_IRQ_ENABLE		0x0008
+#define MSIC_CTRL_FULL_STOP_ENABLE	0x0010
+
+/*
+ * The MSIC can be configured to use a FIFO of 32KB, 64KB, 128KB or 256KB.
+ * Currently we're using a 64KB FIFO size.
+ */
+#define MSIC_FIFO_SIZE_SHIFT	16
+#define MSIC_FIFO_SIZE_BYTES	(1 << MSIC_FIFO_SIZE_SHIFT)
+
+/*
+ * To configure the FIFO size as (1 << n) bytes, we write (n - 15) into bits
+ * 8-9 of the MSIC control reg.
+ */
+#define MSIC_CTRL_FIFO_SIZE	(((MSIC_FIFO_SIZE_SHIFT - 15) << 8) & 0x300)
+
+/*
+ * We need to mask the read/write offsets to make sure they stay within
+ * the bounds of the FIFO. Also they should always be 16-byte aligned.
+ */
+#define MSIC_FIFO_SIZE_MASK	((MSIC_FIFO_SIZE_BYTES - 1) & ~0xFu)
+
+/* Each entry in the FIFO is 16 bytes, the first 4 bytes hold the irq # */
+#define MSIC_FIFO_ENTRY_SIZE	0x10
+
+
+struct axon_msic {
+	struct device_node *dn;
+	struct irq_host *irq_host;
+	__le32 *fifo;
+	dcr_host_t dcr_host;
+	struct list_head list;
+	u32 read_offset;
+	u32 dcr_base;
+};
+
+static LIST_HEAD(axon_msic_list);
+
+static void msic_dcr_write(struct axon_msic *msic, unsigned int dcr_n, u32 val)
+{
+	pr_debug("axon_msi: dcr_write(0x%x, 0x%x)\n", val, dcr_n);
+
+	dcr_write(msic->dcr_host, msic->dcr_base + dcr_n, val);
+}
+
+static u32 msic_dcr_read(struct axon_msic *msic, unsigned int dcr_n)
+{
+	return dcr_read(msic->dcr_host, msic->dcr_base + dcr_n);
+}
+
+static void axon_msi_cascade(unsigned int irq, struct irq_desc *desc)
+{
+	struct axon_msic *msic = get_irq_data(irq);
+	u32 write_offset, msi;
+	int idx;
+
+	write_offset = msic_dcr_read(msic, MSIC_WRITE_OFFSET_REG);
+	pr_debug("axon_msi: original write_offset 0x%x\n", write_offset);
+
+	/* write_offset doesn't wrap properly, so we have to mask it */
+	write_offset &= MSIC_FIFO_SIZE_MASK;
+
+	while (msic->read_offset != write_offset) {
+		idx  = msic->read_offset / sizeof(__le32);
+		msi  = le32_to_cpu(msic->fifo[idx]);
+		msi &= 0xFFFF;
+
+		pr_debug("axon_msi: woff %x roff %x msi %x\n",
+			  write_offset, msic->read_offset, msi);
+
+		msic->read_offset += MSIC_FIFO_ENTRY_SIZE;
+		msic->read_offset &= MSIC_FIFO_SIZE_MASK;
+
+		if (msi < NR_IRQS && irq_map[msi].host == msic->irq_host)
+			generic_handle_irq(msi);
+		else
+			pr_debug("axon_msi: invalid irq 0x%x!\n", msi);
+	}
+
+	desc->chip->eoi(irq);
+}
+
+static struct axon_msic *find_msi_translator(struct pci_dev *dev)
+{
+	struct irq_host *irq_host;
+	struct device_node *dn, *tmp;
+	const phandle *ph;
+	struct axon_msic *msic = NULL;
+
+	dn = of_node_get(pci_device_to_OF_node(dev));	/* put by walk below */
+	if (!dn) {
+		dev_dbg(&dev->dev, "axon_msi: no pci_dn found\n");
+		return NULL;
+	}
+
+	for (; dn; tmp = of_get_parent(dn), of_node_put(dn), dn = tmp) {
+		ph = of_get_property(dn, "msi-translator", NULL);
+		if (ph)
+			break;
+	}
+
+	if (!ph) {
+		dev_dbg(&dev->dev,
+			"axon_msi: no msi-translator property found\n");
+		goto out_error;
+	}
+
+	tmp = dn;	/* node holding the property; released at out_error */
+	dn = of_find_node_by_phandle(*ph);
+	if (!dn) {
+		dev_dbg(&dev->dev,
+			"axon_msi: msi-translator doesn't point to a node\n");
+		goto out_error;
+	}
+
+	irq_host = irq_find_host(dn);
+	if (!irq_host) {
+		dev_dbg(&dev->dev, "axon_msi: no irq_host found for node %s\n",
+			dn->full_name);
+		goto out_error;
+	}
+
+	msic = irq_host->host_data;	/* set in axon_msi_setup_one() */
+
+out_error:
+	of_node_put(dn);
+	of_node_put(tmp);
+
+	return msic;
+}
+
+static int axon_msi_check_device(struct pci_dev *dev, int nvec, int type)
+{
+	if (!find_msi_translator(dev))
+		return -ENODEV;
+
+	return 0;
+}
+
+static int setup_msi_msg_address(struct pci_dev *dev, struct msi_msg *msg)
+{
+	struct device_node *dn, *tmp;
+	struct msi_desc *entry;
+	int len;
+	const u32 *prop;
+
+	dn = of_node_get(pci_device_to_OF_node(dev));	/* put by walk below */
+	if (!dn) {
+		dev_dbg(&dev->dev, "axon_msi: no pci_dn found\n");
+		return -ENODEV;
+	}
+
+	entry = list_first_entry(&dev->msi_list, struct msi_desc, list);
+
+	for (; dn; tmp = of_get_parent(dn), of_node_put(dn), dn = tmp) {
+		if (entry->msi_attrib.is_64) {
+			prop = of_get_property(dn, "msi-address-64", &len);
+			if (prop)
+				break;
+		}
+
+		prop = of_get_property(dn, "msi-address-32", &len);
+		if (prop)
+			break;
+	}
+
+	if (!prop) {	/* walk ran off the root: dn is NULL, nothing to put */
+		dev_dbg(&dev->dev,
+			"axon_msi: no msi-address-(32|64) properties found\n");
+		return -ENOENT;
+	}
+
+	switch (len) {	/* property length in bytes selects 64 vs 32 bit */
+	case 8:
+		msg->address_hi = prop[0];
+		msg->address_lo = prop[1];
+		break;
+	case 4:
+		msg->address_hi = 0;
+		msg->address_lo = prop[0];
+		break;
+	default:
+		dev_dbg(&dev->dev,
+			"axon_msi: malformed msi-address-(32|64) property\n");
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	of_node_put(dn);
+
+	return 0;
+}
+
+static int axon_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	unsigned int virq, rc;
+	struct msi_desc *entry;
+	struct msi_msg msg;
+	struct axon_msic *msic;
+
+	msic = find_msi_translator(dev);
+	if (!msic)
+		return -ENODEV;
+
+	rc = setup_msi_msg_address(dev, &msg);
+	if (rc)
+		return rc;
+
+	/* We rely on being able to stash a virq in a u16 */
+	BUILD_BUG_ON(NR_IRQS > 65536);
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		virq = irq_create_direct_mapping(msic->irq_host);
+		if (virq == NO_IRQ) {
+			dev_warn(&dev->dev,
+				 "axon_msi: virq allocation failed!\n");
+			return -1;
+		}
+		dev_dbg(&dev->dev, "axon_msi: allocated virq 0x%x\n", virq);
+
+		set_irq_msi(virq, entry);
+		msg.data = virq;
+		write_msi_msg(virq, &msg);
+	}
+
+	return 0;
+}
+
+static void axon_msi_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+
+	dev_dbg(&dev->dev, "axon_msi: tearing down msi irqs\n");
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		if (entry->irq == NO_IRQ)
+			continue;
+
+		set_irq_msi(entry->irq, NULL);
+		irq_dispose_mapping(entry->irq);
+	}
+}
+
+static struct irq_chip msic_irq_chip = {
+	.mask		= mask_msi_irq,
+	.unmask		= unmask_msi_irq,
+	.shutdown	= unmask_msi_irq,
+	.typename	= "AXON-MSI",
+};
+
+static int msic_host_map(struct irq_host *h, unsigned int virq,
+			 irq_hw_number_t hw)
+{
+	set_irq_chip_and_handler(virq, &msic_irq_chip, handle_simple_irq);
+
+	return 0;
+}
+
+static int msic_host_match(struct irq_host *host, struct device_node *dn)
+{
+	struct axon_msic *msic = host->host_data;
+
+	return msic->dn == dn;
+}
+
+static struct irq_host_ops msic_host_ops = {
+	.match	= msic_host_match,
+	.map	= msic_host_map,
+};
+
+static int axon_msi_notify_reboot(struct notifier_block *nb,
+				  unsigned long code, void *data)
+{
+	struct axon_msic *msic;
+	u32 tmp;
+
+	list_for_each_entry(msic, &axon_msic_list, list) {
+		pr_debug("axon_msi: disabling %s\n", msic->dn->full_name);
+		tmp  = msic_dcr_read(msic, MSIC_CTRL_REG);
+		tmp &= ~MSIC_CTRL_ENABLE & ~MSIC_CTRL_IRQ_ENABLE;
+		msic_dcr_write(msic, MSIC_CTRL_REG, tmp);
+	}
+
+	return 0;
+}
+
+static struct notifier_block axon_msi_reboot_notifier = {
+	.notifier_call = axon_msi_notify_reboot
+};
+
+static int axon_msi_setup_one(struct device_node *dn)
+{
+	struct page *page;
+	struct axon_msic *msic;
+	unsigned int virq;
+	int dcr_len;
+
+	pr_debug("axon_msi: setting up dn %s\n", dn->full_name);
+
+	msic = kzalloc(sizeof(struct axon_msic), GFP_KERNEL);
+	if (!msic) {
+		printk(KERN_ERR "axon_msi: couldn't allocate msic for %s\n",
+		       dn->full_name);
+		goto out;
+	}
+
+	msic->dcr_base = dcr_resource_start(dn, 0);
+	dcr_len = dcr_resource_len(dn, 0);
+
+	if (msic->dcr_base == 0 || dcr_len == 0) {
+		printk(KERN_ERR
+		       "axon_msi: couldn't parse dcr properties on %s\n",
+			dn->full_name);
+		goto out_free_msic;	/* msic was allocated above */
+	}
+
+	msic->dcr_host = dcr_map(dn, msic->dcr_base, dcr_len);
+	if (!DCR_MAP_OK(msic->dcr_host)) {
+		printk(KERN_ERR "axon_msi: dcr_map failed for %s\n",
+		       dn->full_name);
+		goto out_free_msic;
+	}
+
+	page = alloc_pages_node(of_node_to_nid(dn), GFP_KERNEL,
+				get_order(MSIC_FIFO_SIZE_BYTES));
+	if (!page) {
+		printk(KERN_ERR "axon_msi: couldn't allocate fifo for %s\n",
+		       dn->full_name);
+		goto out_free_msic;
+	}
+
+	msic->fifo = page_address(page);
+
+	msic->irq_host = irq_alloc_host(IRQ_HOST_MAP_NOMAP, NR_IRQS,
+					&msic_host_ops, 0);
+	if (!msic->irq_host) {
+		printk(KERN_ERR "axon_msi: couldn't allocate irq_host for %s\n",
+		       dn->full_name);
+		goto out_free_fifo;
+	}
+
+	msic->irq_host->host_data = msic;
+
+	virq = irq_of_parse_and_map(dn, 0);
+	if (virq == NO_IRQ) {
+		printk(KERN_ERR "axon_msi: irq parse and map failed for %s\n",
+		       dn->full_name);
+		goto out_free_host;
+	}
+
+	msic->dn = of_node_get(dn);
+
+	set_irq_data(virq, msic);
+	set_irq_chained_handler(virq, axon_msi_cascade);
+	pr_debug("axon_msi: irq 0x%x setup for axon_msi\n", virq);
+
+	/* Enable the MSIC hardware */
+	msic_dcr_write(msic, MSIC_BASE_ADDR_HI_REG, (u64)msic->fifo >> 32);
+	msic_dcr_write(msic, MSIC_BASE_ADDR_LO_REG,
+				  (u64)msic->fifo & 0xFFFFFFFF);
+	msic_dcr_write(msic, MSIC_CTRL_REG,
+			MSIC_CTRL_IRQ_ENABLE | MSIC_CTRL_ENABLE |
+			MSIC_CTRL_FIFO_SIZE);
+
+	list_add(&msic->list, &axon_msic_list);
+
+	printk(KERN_DEBUG "axon_msi: setup MSIC on %s\n", dn->full_name);
+
+	return 0;
+
+out_free_host:
+	kfree(msic->irq_host);
+out_free_fifo:
+	__free_pages(virt_to_page(msic->fifo), get_order(MSIC_FIFO_SIZE_BYTES));
+out_free_msic:
+	kfree(msic);
+out:
+
+	return -1;
+}
+
+static int axon_msi_init(void)
+{
+	struct device_node *dn;
+	int found = 0;
+
+	pr_debug("axon_msi: initialising ...\n");
+
+	for_each_compatible_node(dn, NULL, "ibm,axon-msic") {
+		if (axon_msi_setup_one(dn) == 0)
+			found++;
+	}
+
+	if (found) {
+		ppc_md.setup_msi_irqs = axon_msi_setup_msi_irqs;
+		ppc_md.teardown_msi_irqs = axon_msi_teardown_msi_irqs;
+		ppc_md.msi_check_device = axon_msi_check_device;
+
+		register_reboot_notifier(&axon_msi_reboot_notifier);
+
+		pr_debug("axon_msi: registered callbacks!\n");
+	}
+
+	return 0;
+}
+arch_initcall(axon_msi_init);
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index ab511d5b65a4..0b6e8ee85ab1 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -1,7 +1,7 @@
 /*
  * cpufreq driver for the cell processor
  *
- * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
  *
  * Author: Christian Krafft <krafft@de.ibm.com>
  *
@@ -21,18 +21,11 @@
  */
 
 #include <linux/cpufreq.h>
-#include <linux/timer.h>
-
-#include <asm/hw_irq.h>
-#include <asm/io.h>
 #include <asm/machdep.h>
-#include <asm/processor.h>
-#include <asm/prom.h>
-#include <asm/time.h>
-#include <asm/pmi.h>
 #include <asm/of_platform.h>
-
+#include <asm/prom.h>
 #include "cbe_regs.h"
+#include "cbe_cpufreq.h"
 
 static DEFINE_MUTEX(cbe_switch_mutex);
 
@@ -50,159 +43,24 @@ static struct cpufreq_frequency_table cbe_freqs[] = {
 	{0,	CPUFREQ_TABLE_END},
 };
 
-/* to write to MIC register */
-static u64 MIC_Slow_Fast_Timer_table[] = {
-	[0 ... 7] = 0x007fc00000000000ull,
-};
-
-/* more values for the MIC */
-static u64 MIC_Slow_Next_Timer_table[] = {
-	0x0000240000000000ull,
-	0x0000268000000000ull,
-	0x000029C000000000ull,
-	0x00002D0000000000ull,
-	0x0000300000000000ull,
-	0x0000334000000000ull,
-	0x000039C000000000ull,
-	0x00003FC000000000ull,
-};
-
-static unsigned int pmi_frequency_limit = 0;
 /*
  * hardware specific functions
  */
 
-static struct of_device *pmi_dev;
-
-#ifdef CONFIG_PPC_PMI
-static int set_pmode_pmi(int cpu, unsigned int pmode)
-{
-	int ret;
-	pmi_message_t pmi_msg;
-#ifdef DEBUG
-	u64 time;
-#endif
-
-	pmi_msg.type = PMI_TYPE_FREQ_CHANGE;
-	pmi_msg.data1 =	cbe_cpu_to_node(cpu);
-	pmi_msg.data2 = pmode;
-
-#ifdef DEBUG
-	time = (u64) get_cycles();
-#endif
-
-	pmi_send_message(pmi_dev, pmi_msg);
-	ret = pmi_msg.data2;
-
-	pr_debug("PMI returned slow mode %d\n", ret);
-
-#ifdef DEBUG
-	time = (u64) get_cycles() - time; /* actual cycles (not cpu cycles!) */
-	time = 1000000000 * time / CLOCK_TICK_RATE; /* time in ns (10^-9) */
-	pr_debug("had to wait %lu ns for a transition\n", time);
-#endif
-	return ret;
-}
-#endif
-
-static int get_pmode(int cpu)
+static int set_pmode(unsigned int cpu, unsigned int slow_mode)
 {
-	int ret;
-	struct cbe_pmd_regs __iomem *pmd_regs;
-
-	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
-	ret = in_be64(&pmd_regs->pmsr) & 0x07;
-
-	return ret;
-}
-
-static int set_pmode_reg(int cpu, unsigned int pmode)
-{
-	struct cbe_pmd_regs __iomem *pmd_regs;
-	struct cbe_mic_tm_regs __iomem *mic_tm_regs;
-	u64 flags;
-	u64 value;
-
-	local_irq_save(flags);
-
-	mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu);
-	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
-
-	pr_debug("pm register is mapped at %p\n", &pmd_regs->pmcr);
-	pr_debug("mic register is mapped at %p\n", &mic_tm_regs->slow_fast_timer_0);
-
-	out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]);
-	out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]);
-
-	out_be64(&mic_tm_regs->slow_next_timer_0, MIC_Slow_Next_Timer_table[pmode]);
-	out_be64(&mic_tm_regs->slow_next_timer_1, MIC_Slow_Next_Timer_table[pmode]);
-
-	value = in_be64(&pmd_regs->pmcr);
-	/* set bits to zero */
-	value &= 0xFFFFFFFFFFFFFFF8ull;
-	/* set bits to next pmode */
-	value |= pmode;
-
-	out_be64(&pmd_regs->pmcr, value);
-
-	/* wait until new pmode appears in status register */
-	value = in_be64(&pmd_regs->pmsr) & 0x07;
-	while(value != pmode) {
-		cpu_relax();
-		value = in_be64(&pmd_regs->pmsr) & 0x07;
-	}
-
-	local_irq_restore(flags);
-
-	return 0;
-}
+	int rc;
 
-static int set_pmode(int cpu, unsigned int slow_mode) {
-#ifdef CONFIG_PPC_PMI
-	if (pmi_dev)
-		return set_pmode_pmi(cpu, slow_mode);
+	if (cbe_cpufreq_has_pmi)
+		rc = cbe_cpufreq_set_pmode_pmi(cpu, slow_mode);
 	else
-#endif
-		return set_pmode_reg(cpu, slow_mode);
-}
-
-static void cbe_cpufreq_handle_pmi(struct of_device *dev, pmi_message_t pmi_msg)
-{
-	u8 cpu;
-	u8 cbe_pmode_new;
-
-	BUG_ON(pmi_msg.type != PMI_TYPE_FREQ_CHANGE);
+		rc = cbe_cpufreq_set_pmode(cpu, slow_mode);
 
-	cpu = cbe_node_to_cpu(pmi_msg.data1);
-	cbe_pmode_new = pmi_msg.data2;
+	pr_debug("register contains slow mode %d\n", cbe_cpufreq_get_pmode(cpu));
 
-	pmi_frequency_limit = cbe_freqs[cbe_pmode_new].frequency;
-
-	pr_debug("cbe_handle_pmi: max freq=%d\n", pmi_frequency_limit);
-}
-
-static int pmi_notifier(struct notifier_block *nb,
-				       unsigned long event, void *data)
-{
-	struct cpufreq_policy *policy = data;
-
-	if (event != CPUFREQ_INCOMPATIBLE)
-		return 0;
-
-	cpufreq_verify_within_limits(policy, 0, pmi_frequency_limit);
-	return 0;
+	return rc;
 }
 
-static struct notifier_block pmi_notifier_block = {
-	.notifier_call = pmi_notifier,
-};
-
-static struct pmi_handler cbe_pmi_handler = {
-	.type			= PMI_TYPE_FREQ_CHANGE,
-	.handle_pmi_message	= cbe_cpufreq_handle_pmi,
-};
-
-
 /*
  * cpufreq functions
  */
@@ -221,8 +79,19 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	pr_debug("init cpufreq on CPU %d\n", policy->cpu);
 
+	/*
+	 * Let's check we can actually get to the CELL regs
+	 */
+	if (!cbe_get_cpu_pmd_regs(policy->cpu) ||
+	    !cbe_get_cpu_mic_tm_regs(policy->cpu)) {
+		pr_info("invalid CBE regs pointers for cpufreq\n");
+		return -EINVAL;
+	}
+
 	max_freqp = of_get_property(cpu, "clock-frequency", NULL);
 
+	of_node_put(cpu);
+
 	if (!max_freqp)
 		return -EINVAL;
 
@@ -239,10 +108,12 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	}
 
 	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
-	/* if DEBUG is enabled set_pmode() measures the correct latency of a transition */
+
+	/* if DEBUG is enabled set_pmode() measures the latency
+	 * of a transition */
 	policy->cpuinfo.transition_latency = 25000;
 
-	cur_pmode = get_pmode(policy->cpu);
+	cur_pmode = cbe_cpufreq_get_pmode(policy->cpu);
 	pr_debug("current pmode is at %d\n",cur_pmode);
 
 	policy->cur = cbe_freqs[cur_pmode].frequency;
@@ -253,21 +124,13 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	cpufreq_frequency_table_get_attr(cbe_freqs, policy->cpu);
 
-	if (pmi_dev) {
-		/* frequency might get limited later, initialize limit with max_freq */
-		pmi_frequency_limit = max_freq;
-		cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
-	}
-
-	/* this ensures that policy->cpuinfo_min and policy->cpuinfo_max are set correctly */
+	/* this ensures that policy->cpuinfo_min
+	 * and policy->cpuinfo_max are set correctly */
 	return cpufreq_frequency_table_cpuinfo(policy, cbe_freqs);
 }
 
 static int cbe_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
-	if (pmi_dev)
-		cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
-
 	cpufreq_frequency_table_put_attr(policy->cpu);
 	return 0;
 }
@@ -277,13 +140,13 @@ static int cbe_cpufreq_verify(struct cpufreq_policy *policy)
 	return cpufreq_frequency_table_verify(policy, cbe_freqs);
 }
 
-
-static int cbe_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq,
-			    unsigned int relation)
+static int cbe_cpufreq_target(struct cpufreq_policy *policy,
+			      unsigned int target_freq,
+			      unsigned int relation)
 {
 	int rc;
 	struct cpufreq_freqs freqs;
-	int cbe_pmode_new;
+	unsigned int cbe_pmode_new;
 
 	cpufreq_frequency_table_target(policy,
 				       cbe_freqs,
@@ -298,12 +161,14 @@ static int cbe_cpufreq_target(struct cpufreq_policy *policy, unsigned int target
 	mutex_lock(&cbe_switch_mutex);
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
-	pr_debug("setting frequency for cpu %d to %d kHz, 1/%d of max frequency\n",
+	pr_debug("setting frequency for cpu %d to %d kHz, " \
+		 "1/%d of max frequency\n",
 		 policy->cpu,
 		 cbe_freqs[cbe_pmode_new].frequency,
 		 cbe_freqs[cbe_pmode_new].index);
 
 	rc = set_pmode(policy->cpu, cbe_pmode_new);
+
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	mutex_unlock(&cbe_switch_mutex);
 
@@ -326,28 +191,14 @@ static struct cpufreq_driver cbe_cpufreq_driver = {
 
 static int __init cbe_cpufreq_init(void)
 {
-#ifdef CONFIG_PPC_PMI
-	struct device_node *np;
-#endif
 	if (!machine_is(cell))
 		return -ENODEV;
-#ifdef CONFIG_PPC_PMI
-	np = of_find_node_by_type(NULL, "ibm,pmi");
-
-	pmi_dev = of_find_device_by_node(np);
 
-	if (pmi_dev)
-		pmi_register_handler(pmi_dev, &cbe_pmi_handler);
-#endif
 	return cpufreq_register_driver(&cbe_cpufreq_driver);
 }
 
 static void __exit cbe_cpufreq_exit(void)
 {
-#ifdef CONFIG_PPC_PMI
-	if (pmi_dev)
-		pmi_unregister_handler(pmi_dev, &cbe_pmi_handler);
-#endif
 	cpufreq_unregister_driver(&cbe_cpufreq_driver);
 }
 
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.h b/arch/powerpc/platforms/cell/cbe_cpufreq.h
new file mode 100644
index 000000000000..c1d86bfa92ff
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.h
@@ -0,0 +1,24 @@
+/*
+ * cbe_cpufreq.h
+ *
+ * This file contains the definitions used by the cbe_cpufreq driver.
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/types.h>
+
+int cbe_cpufreq_set_pmode(int cpu, unsigned int pmode);
+int cbe_cpufreq_get_pmode(int cpu);
+
+int cbe_cpufreq_set_pmode_pmi(int cpu, unsigned int pmode);
+
+#if defined(CONFIG_CBE_CPUFREQ_PMI) || defined(CONFIG_CBE_CPUFREQ_PMI_MODULE)
+extern bool cbe_cpufreq_has_pmi;
+#else
+#define cbe_cpufreq_has_pmi (0)
+#endif
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c b/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c
new file mode 100644
index 000000000000..163263b3e1cd
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c
@@ -0,0 +1,115 @@
+/*
+ * pervasive backend for the cbe_cpufreq driver
+ *
+ * This driver makes use of the pervasive unit to
+ * engage the desired frequency.
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <asm/machdep.h>
+#include <asm/hw_irq.h>
+
+#include "cbe_regs.h"
+#include "cbe_cpufreq.h"
+
+/* to write to MIC register */
+static u64 MIC_Slow_Fast_Timer_table[] = {
+	[0 ... 7] = 0x007fc00000000000ull,
+};
+
+/* more values for the MIC */
+static u64 MIC_Slow_Next_Timer_table[] = {
+	0x0000240000000000ull,
+	0x0000268000000000ull,
+	0x000029C000000000ull,
+	0x00002D0000000000ull,
+	0x0000300000000000ull,
+	0x0000334000000000ull,
+	0x000039C000000000ull,
+	0x00003FC000000000ull,
+};
+
+
+/*
+ * Switch @cpu to slow mode @pmode with local interrupts disabled.  @pmode
+ * indexes the 8-entry MIC timer tables above and is not range-checked
+ * here.  Always returns 0.
+ */
+int cbe_cpufreq_set_pmode(int cpu, unsigned int pmode)
+{
+	struct cbe_pmd_regs __iomem *pmd_regs;
+	struct cbe_mic_tm_regs __iomem *mic_tm_regs;
+	unsigned long flags;	/* the type local_irq_save() expects */
+	u64 value;
+#ifdef DEBUG
+	unsigned long time;
+#endif
+
+	local_irq_save(flags);
+	mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu);
+	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
+
+#ifdef DEBUG
+	time = jiffies;
+#endif
+
+	out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]);
+	out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]);
+	out_be64(&mic_tm_regs->slow_next_timer_0, MIC_Slow_Next_Timer_table[pmode]);
+	out_be64(&mic_tm_regs->slow_next_timer_1, MIC_Slow_Next_Timer_table[pmode]);
+
+	/* replace the low three bits of pmcr with the new pmode */
+	value = in_be64(&pmd_regs->pmcr);
+	value &= 0xFFFFFFFFFFFFFFF8ull;
+	value |= pmode;
+	out_be64(&pmd_regs->pmcr, value);
+
+#ifdef DEBUG
+	/* wait until new pmode appears in status register */
+	value = in_be64(&pmd_regs->pmsr) & 0x07;
+	while (value != pmode) {
+		cpu_relax();
+		value = in_be64(&pmd_regs->pmsr) & 0x07;
+	}
+
+	time = jiffies  - time;
+	time = jiffies_to_msecs(time);
+	pr_debug("had to wait %lu ms for a transition using " \
+		 "pervasive unit\n", time);
+#endif
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+
+/*
+ * Return the slow mode currently reported in the low three bits of the
+ * pmsr register in @cpu's PMD register block.
+ */
+int cbe_cpufreq_get_pmode(int cpu)
+{
+	struct cbe_pmd_regs __iomem *regs = cbe_get_cpu_pmd_regs(cpu);
+
+	return in_be64(&regs->pmsr) & 0x07;
+}
+
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c
new file mode 100644
index 000000000000..fc6f38982ff4
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c
@@ -0,0 +1,148 @@
+/*
+ * pmi backend for the cbe_cpufreq driver
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <asm/of_platform.h>
+#include <asm/processor.h>
+#include <asm/prom.h>
+#include <asm/pmi.h>
+
+#ifdef DEBUG
+#include <asm/time.h>
+#endif
+
+#include "cbe_regs.h"
+#include "cbe_cpufreq.h"
+
+static u8 pmi_slow_mode_limit[MAX_CBE];
+
+bool cbe_cpufreq_has_pmi = false;
+EXPORT_SYMBOL_GPL(cbe_cpufreq_has_pmi);
+
+/*
+ * hardware specific functions
+ */
+
+/* Request slow mode @pmode for @cpu's node via PMI; returns msg.data2. */
+int cbe_cpufreq_set_pmode_pmi(int cpu, unsigned int pmode)
+{
+	pmi_message_t msg;
+	int slow_mode;
+#ifdef DEBUG
+	long elapsed;
+#endif
+	msg.type = PMI_TYPE_FREQ_CHANGE;
+	msg.data1 = cbe_cpu_to_node(cpu);
+	msg.data2 = pmode;
+
+#ifdef DEBUG
+	elapsed = jiffies;
+#endif
+	pmi_send_message(msg);
+
+#ifdef DEBUG
+	elapsed = jiffies_to_msecs(jiffies - elapsed);
+	pr_debug("had to wait %lu ms for a transition using " \
+		 "PMI\n", elapsed);
+#endif
+	slow_mode = msg.data2;
+	pr_debug("PMI returned slow mode %d\n", slow_mode);
+
+	return slow_mode;
+}
+EXPORT_SYMBOL_GPL(cbe_cpufreq_set_pmode_pmi);
+
+
+/*
+ * PMI handler: record the slow mode (data2) reported for a node (data1).
+ */
+static void cbe_cpufreq_handle_pmi(pmi_message_t pmi_msg)
+{
+	u8 node = pmi_msg.data1;
+	u8 slow_mode = pmi_msg.data2;
+
+	BUG_ON(pmi_msg.type != PMI_TYPE_FREQ_CHANGE);
+
+	pmi_slow_mode_limit[node] = slow_mode;
+	pr_debug("cbe_handle_pmi: node: %d max_freq: %d\n", node, slow_mode);
+}
+
+/*
+ * cpufreq policy notifier: if a slow mode limit is recorded for the node of
+ * policy->cpu, pass that mode's table frequency as the policy's upper bound.
+ */
+static int pmi_notifier(struct notifier_block *nb,
+			unsigned long event, void *data)
+{
+	struct cpufreq_policy *policy = data;
+	struct cpufreq_frequency_table *table;
+	u8 node, limit;
+
+	table = cpufreq_frequency_get_table(policy->cpu);
+	node = cbe_cpu_to_node(policy->cpu);
+	pr_debug("got notified, event=%lu, node=%u\n", event, node);
+
+	limit = pmi_slow_mode_limit[node];
+	if (limit == 0)
+		return 0;
+
+	pr_debug("limiting node %d to slow mode %d\n", node, limit);
+	cpufreq_verify_within_limits(policy, 0, table[limit].frequency);
+	return 0;
+}
+
+static struct notifier_block pmi_notifier_block = {
+	.notifier_call = pmi_notifier,
+};
+
+static struct pmi_handler cbe_pmi_handler = {
+	.type			= PMI_TYPE_FREQ_CHANGE,
+	.handle_pmi_message	= cbe_cpufreq_handle_pmi,
+};
+
+
+
+/* Register the PMI handler and, if that succeeded, the policy notifier. */
+static int __init cbe_cpufreq_pmi_init(void)
+{
+	cbe_cpufreq_has_pmi = (pmi_register_handler(&cbe_pmi_handler) == 0);
+	if (cbe_cpufreq_has_pmi) {
+		cpufreq_register_notifier(&pmi_notifier_block,
+					  CPUFREQ_POLICY_NOTIFIER);
+		return 0;
+	}
+	return -ENODEV;
+}
+
+static void __exit cbe_cpufreq_pmi_exit(void)
+{
+	cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
+	pmi_unregister_handler(&cbe_pmi_handler);
+}
+
+module_init(cbe_cpufreq_pmi_init);
+module_exit(cbe_cpufreq_pmi_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");
diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c
index 12c9674b4b1f..c8f7f0007422 100644
--- a/arch/powerpc/platforms/cell/cbe_regs.c
+++ b/arch/powerpc/platforms/cell/cbe_regs.c
@@ -174,6 +174,13 @@ static struct device_node *cbe_get_be_node(int cpu_id)
 
 		cpu_handle = of_get_property(np, "cpus", &len);
 
+		/*
+		 * the CAB SLOF tree is non compliant, so we just assume
+		 * there is only one node
+		 */
+		if (WARN_ON_ONCE(!cpu_handle))
+			return np;
+
 		for (i=0; i<len; i++)
 			if (of_find_node_by_phandle(cpu_handle[i]) == of_get_cpu_node(cpu_id, NULL))
 				return np;
diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c
index f370f0fa6f4c..e4132f8f51b3 100644
--- a/arch/powerpc/platforms/cell/cbe_thermal.c
+++ b/arch/powerpc/platforms/cell/cbe_thermal.c
@@ -292,7 +292,7 @@ static struct attribute_group ppe_attribute_group = {
 /*
  * initialize throttling with default values
  */
-static void __init init_default_values(void)
+static int __init init_default_values(void)
 {
 	int cpu;
 	struct cbe_pmd_regs __iomem *pmd_regs;
@@ -339,25 +339,40 @@ static void __init init_default_values(void)
 	for_each_possible_cpu (cpu) {
 		pr_debug("processing cpu %d\n", cpu);
 		sysdev = get_cpu_sysdev(cpu);
+
+		if (!sysdev) {
+			pr_info("invalid sysdev pointer for cbe_thermal\n");
+			return -EINVAL;
+		}
+
 		pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
 
+		if (!pmd_regs) {
+			pr_info("invalid CBE regs pointer for cbe_thermal\n");
+			return -EINVAL;
+		}
+
 		out_be64(&pmd_regs->tm_str2, str2);
 		out_be64(&pmd_regs->tm_str1.val, str1.val);
 		out_be64(&pmd_regs->tm_tpr.val, tpr.val);
 		out_be64(&pmd_regs->tm_cr1.val, cr1.val);
 		out_be64(&pmd_regs->tm_cr2, cr2);
 	}
+
+	return 0;
 }
 
 
 static int __init thermal_init(void)
 {
-	init_default_values();
+	int rc = init_default_values();
 
-	spu_add_sysdev_attr_group(&spu_attribute_group);
-	cpu_add_sysdev_attr_group(&ppe_attribute_group);
+	if (rc == 0) {
+		spu_add_sysdev_attr_group(&spu_attribute_group);
+		cpu_add_sysdev_attr_group(&ppe_attribute_group);
+	}
 
-	return 0;
+	return rc;
 }
 module_init(thermal_init);
 
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 96a8f609690c..90124228b8f4 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -35,18 +35,37 @@
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
 #include <asm/xmon.h>
+#include <asm/prom.h>
+#include "spu_priv1_mmio.h"
 
 const struct spu_management_ops *spu_management_ops;
 EXPORT_SYMBOL_GPL(spu_management_ops);
 
 const struct spu_priv1_ops *spu_priv1_ops;
+EXPORT_SYMBOL_GPL(spu_priv1_ops);
 
-static struct list_head spu_list[MAX_NUMNODES];
-static LIST_HEAD(spu_full_list);
-static DEFINE_MUTEX(spu_mutex);
-static DEFINE_SPINLOCK(spu_list_lock);
+struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
+EXPORT_SYMBOL_GPL(cbe_spu_info);
 
-EXPORT_SYMBOL_GPL(spu_priv1_ops);
+/*
+ * Protects cbe_spu_info and spu->number.
+ */
+static DEFINE_SPINLOCK(spu_lock);
+
+/*
+ * List of all spus in the system.
+ *
+ * This list is iterated by callers from irq context and callers that
+ * want to sleep.  Thus modifications need to be done with both
+ * spu_full_list_lock and spu_full_list_mutex held, while iterating
+ * through it requires either of these locks.
+ *
+ * In addition spu_full_list_lock protects all assignments to
+ * spu->mm.
+ */
+static LIST_HEAD(spu_full_list);
+static DEFINE_SPINLOCK(spu_full_list_lock);
+static DEFINE_MUTEX(spu_full_list_mutex);
 
 void spu_invalidate_slbs(struct spu *spu)
 {
@@ -65,12 +84,12 @@ void spu_flush_all_slbs(struct mm_struct *mm)
 	struct spu *spu;
 	unsigned long flags;
 
-	spin_lock_irqsave(&spu_list_lock, flags);
+	spin_lock_irqsave(&spu_full_list_lock, flags);
 	list_for_each_entry(spu, &spu_full_list, full_list) {
 		if (spu->mm == mm)
 			spu_invalidate_slbs(spu);
 	}
-	spin_unlock_irqrestore(&spu_list_lock, flags);
+	spin_unlock_irqrestore(&spu_full_list_lock, flags);
 }
 
 /* The hack below stinks... try to do something better one of
@@ -88,9 +107,9 @@ void spu_associate_mm(struct spu *spu, struct mm_struct *mm)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&spu_list_lock, flags);
+	spin_lock_irqsave(&spu_full_list_lock, flags);
 	spu->mm = mm;
-	spin_unlock_irqrestore(&spu_list_lock, flags);
+	spin_unlock_irqrestore(&spu_full_list_lock, flags);
 	if (mm)
 		mm_needs_global_tlbie(mm);
 }
@@ -390,7 +409,7 @@ static void spu_free_irqs(struct spu *spu)
 		free_irq(spu->irqs[2], spu);
 }
 
-static void spu_init_channels(struct spu *spu)
+void spu_init_channels(struct spu *spu)
 {
 	static const struct {
 		 unsigned channel;
@@ -423,46 +442,7 @@ static void spu_init_channels(struct spu *spu)
 		out_be64(&priv2->spu_chnlcnt_RW, count_list[i].count);
 	}
 }
-
-struct spu *spu_alloc_node(int node)
-{
-	struct spu *spu = NULL;
-
-	mutex_lock(&spu_mutex);
-	if (!list_empty(&spu_list[node])) {
-		spu = list_entry(spu_list[node].next, struct spu, list);
-		list_del_init(&spu->list);
-		pr_debug("Got SPU %d %d\n", spu->number, spu->node);
-	}
-	mutex_unlock(&spu_mutex);
-
-	if (spu)
-		spu_init_channels(spu);
-	return spu;
-}
-EXPORT_SYMBOL_GPL(spu_alloc_node);
-
-struct spu *spu_alloc(void)
-{
-	struct spu *spu = NULL;
-	int node;
-
-	for (node = 0; node < MAX_NUMNODES; node++) {
-		spu = spu_alloc_node(node);
-		if (spu)
-			break;
-	}
-
-	return spu;
-}
-
-void spu_free(struct spu *spu)
-{
-	mutex_lock(&spu_mutex);
-	list_add_tail(&spu->list, &spu_list[spu->node]);
-	mutex_unlock(&spu_mutex);
-}
-EXPORT_SYMBOL_GPL(spu_free);
+EXPORT_SYMBOL_GPL(spu_init_channels);
 
 static int spu_shutdown(struct sys_device *sysdev)
 {
@@ -481,12 +461,12 @@ struct sysdev_class spu_sysdev_class = {
 int spu_add_sysdev_attr(struct sysdev_attribute *attr)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysdev_create_file(&spu->sysdev, attr);
+	mutex_unlock(&spu_full_list_mutex);
 
-	mutex_unlock(&spu_mutex);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
@@ -494,12 +474,12 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
 int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysfs_create_group(&spu->sysdev.kobj, attrs);
+	mutex_unlock(&spu_full_list_mutex);
 
-	mutex_unlock(&spu_mutex);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
@@ -508,24 +488,22 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
 void spu_remove_sysdev_attr(struct sysdev_attribute *attr)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysdev_remove_file(&spu->sysdev, attr);
-
-	mutex_unlock(&spu_mutex);
+	mutex_unlock(&spu_full_list_mutex);
 }
 EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr);
 
 void spu_remove_sysdev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysfs_remove_group(&spu->sysdev.kobj, attrs);
-
-	mutex_unlock(&spu_mutex);
+	mutex_unlock(&spu_full_list_mutex);
 }
 EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr_group);
 
@@ -553,16 +531,19 @@ static int __init create_spu(void *data)
 	int ret;
 	static int number;
 	unsigned long flags;
+	struct timespec ts;
 
 	ret = -ENOMEM;
 	spu = kzalloc(sizeof (*spu), GFP_KERNEL);
 	if (!spu)
 		goto out;
 
+	spu->alloc_state = SPU_FREE;
+
 	spin_lock_init(&spu->register_lock);
-	mutex_lock(&spu_mutex);
+	spin_lock(&spu_lock);
 	spu->number = number++;
-	mutex_unlock(&spu_mutex);
+	spin_unlock(&spu_lock);
 
 	ret = spu_create_spu(spu, data);
 
@@ -579,15 +560,22 @@ static int __init create_spu(void *data)
 	if (ret)
 		goto out_free_irqs;
 
-	mutex_lock(&spu_mutex);
-	spin_lock_irqsave(&spu_list_lock, flags);
-	list_add(&spu->list, &spu_list[spu->node]);
+	mutex_lock(&cbe_spu_info[spu->node].list_mutex);
+	list_add(&spu->cbe_list, &cbe_spu_info[spu->node].spus);
+	cbe_spu_info[spu->node].n_spus++;
+	mutex_unlock(&cbe_spu_info[spu->node].list_mutex);
+
+	mutex_lock(&spu_full_list_mutex);
+	spin_lock_irqsave(&spu_full_list_lock, flags);
 	list_add(&spu->full_list, &spu_full_list);
-	spin_unlock_irqrestore(&spu_list_lock, flags);
-	mutex_unlock(&spu_mutex);
+	spin_unlock_irqrestore(&spu_full_list_lock, flags);
+	mutex_unlock(&spu_full_list_mutex);
+
+	spu->stats.util_state = SPU_UTIL_IDLE_LOADED;
+	ktime_get_ts(&ts);
+	spu->stats.tstamp = timespec_to_ns(&ts);
 
-	spu->stats.utilization_state = SPU_UTIL_IDLE;
-	spu->stats.tstamp = jiffies;
+	INIT_LIST_HEAD(&spu->aff_list);
 
 	goto out;
 
@@ -608,12 +596,20 @@ static const char *spu_state_names[] = {
 static unsigned long long spu_acct_time(struct spu *spu,
 		enum spu_utilization_state state)
 {
+	struct timespec ts;
 	unsigned long long time = spu->stats.times[state];
 
-	if (spu->stats.utilization_state == state)
-		time += jiffies - spu->stats.tstamp;
+	/*
+	 * If the spu is idle or the context is stopped, utilization
+	 * statistics are not updated.  Apply the time delta from the
+	 * last recorded state of the spu.
+	 */
+	if (spu->stats.util_state == state) {
+		ktime_get_ts(&ts);
+		time += timespec_to_ns(&ts) - spu->stats.tstamp;
+	}
 
-	return jiffies_to_msecs(time);
+	return time / NSEC_PER_MSEC;
 }
 
 
@@ -623,11 +619,11 @@ static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf)
 
 	return sprintf(buf, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
-		spu_state_names[spu->stats.utilization_state],
+		spu_state_names[spu->stats.util_state],
 		spu_acct_time(spu, SPU_UTIL_USER),
 		spu_acct_time(spu, SPU_UTIL_SYSTEM),
 		spu_acct_time(spu, SPU_UTIL_IOWAIT),
-		spu_acct_time(spu, SPU_UTIL_IDLE),
+		spu_acct_time(spu, SPU_UTIL_IDLE_LOADED),
 		spu->stats.vol_ctx_switch,
 		spu->stats.invol_ctx_switch,
 		spu->stats.slb_flt,
@@ -640,12 +636,146 @@ static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf)
 
 static SYSDEV_ATTR(stat, 0644, spu_stat_show, NULL);
 
+/* Hardcoded affinity idxs for QS20 */
+#define SPES_PER_BE 8
+static int QS20_reg_idxs[SPES_PER_BE] =   { 0, 2, 4, 6, 7, 5, 3, 1 };
+static int QS20_reg_memory[SPES_PER_BE] = { 1, 1, 0, 0, 0, 0, 0, 0 };
+
+static struct spu *spu_lookup_reg(int node, u32 reg)
+{
+	struct spu *spu;
+
+	list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+		if (*(u32 *)get_property(spu_devnode(spu), "reg", NULL) == reg)
+			return spu;
+	}
+	return NULL;
+}
+
+static void init_aff_QS20_harcoded(void)
+{
+	int node, i;
+	struct spu *last_spu, *spu;
+	u32 reg;
+
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		last_spu = NULL;
+		for (i = 0; i < SPES_PER_BE; i++) {
+			reg = QS20_reg_idxs[i];
+			spu = spu_lookup_reg(node, reg);
+			if (!spu)
+				continue;
+			spu->has_mem_affinity = QS20_reg_memory[reg];
+			if (last_spu)
+				list_add_tail(&spu->aff_list,
+						&last_spu->aff_list);
+			last_spu = spu;
+		}
+	}
+}
+
+static int of_has_vicinity(void)
+{
+	struct spu* spu;
+
+	spu = list_entry(cbe_spu_info[0].spus.next, struct spu, cbe_list);
+	return of_find_property(spu_devnode(spu), "vicinity", NULL) != NULL;
+}
+
+static struct spu *aff_devnode_spu(int cbe, struct device_node *dn)
+{
+	struct spu *spu;
+
+	list_for_each_entry(spu, &cbe_spu_info[cbe].spus, cbe_list)
+		if (spu_devnode(spu) == dn)
+			return spu;
+	return NULL;
+}
+
+static struct spu *
+aff_node_next_to(int cbe, struct device_node *target, struct device_node *avoid)
+{
+	struct spu *spu;
+	const phandle *vic_handles;
+	int lenp, i;
+
+	list_for_each_entry(spu, &cbe_spu_info[cbe].spus, cbe_list) {
+		if (spu_devnode(spu) == avoid)
+			continue;
+		vic_handles = get_property(spu_devnode(spu), "vicinity", &lenp);
+		for (i=0; i < (lenp / sizeof(phandle)); i++) {
+			if (vic_handles[i] == target->linux_phandle)
+				return spu;
+		}
+	}
+	return NULL;
+}
+
+static void init_aff_fw_vicinity_node(int cbe)
+{
+	struct spu *spu, *last_spu;
+	struct device_node *vic_dn, *last_spu_dn;
+	phandle avoid_ph;
+	const phandle *vic_handles;
+	const char *name;
+	int lenp, i, added, mem_aff;
+
+	last_spu = list_entry(cbe_spu_info[cbe].spus.next, struct spu, cbe_list);
+	avoid_ph = 0;
+	for (added = 1; added < cbe_spu_info[cbe].n_spus; added++) {
+		last_spu_dn = spu_devnode(last_spu);
+		vic_handles = get_property(last_spu_dn, "vicinity", &lenp);
+
+		for (i = 0; i < (lenp / sizeof(phandle)); i++) {
+			if (vic_handles[i] == avoid_ph)
+				continue;
+
+			vic_dn = of_find_node_by_phandle(vic_handles[i]);
+			if (!vic_dn)
+				continue;
+
+			name = get_property(vic_dn, "name", NULL);
+			if (strcmp(name, "spe") == 0) {
+				spu = aff_devnode_spu(cbe, vic_dn);
+				avoid_ph = last_spu_dn->linux_phandle;
+			}
+			else {
+				mem_aff = strcmp(name, "mic-tm") == 0;
+				spu = aff_node_next_to(cbe, vic_dn, last_spu_dn);
+				if (!spu)
+					continue;
+				if (mem_aff) {
+					last_spu->has_mem_affinity = 1;
+					spu->has_mem_affinity = 1;
+				}
+				avoid_ph = vic_dn->linux_phandle;
+			}
+			list_add_tail(&spu->aff_list, &last_spu->aff_list);
+			last_spu = spu;
+			break;
+		}
+	}
+}
+
+static void init_aff_fw_vicinity(void)
+{
+	int cbe;
+
+	/* sets has_mem_affinity for each spu, as well as the
+	 * spu->aff_list list, linking each spu to its neighbors
+	 */
+	for (cbe = 0; cbe < MAX_NUMNODES; cbe++)
+		init_aff_fw_vicinity_node(cbe);
+}
+
 static int __init init_spu_base(void)
 {
 	int i, ret = 0;
 
-	for (i = 0; i < MAX_NUMNODES; i++)
-		INIT_LIST_HEAD(&spu_list[i]);
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		mutex_init(&cbe_spu_info[i].list_mutex);
+		INIT_LIST_HEAD(&cbe_spu_info[i].spus);
+	}
 
 	if (!spu_management_ops)
 		goto out;
@@ -675,16 +805,25 @@ static int __init init_spu_base(void)
 		fb_append_extra_logo(&logo_spe_clut224, ret);
 	}
 
+	mutex_lock(&spu_full_list_mutex);
 	xmon_register_spus(&spu_full_list);
-
+	crash_register_spus(&spu_full_list);
+	mutex_unlock(&spu_full_list_mutex);
 	spu_add_sysdev_attr(&attr_stat);
 
+	if (of_has_vicinity()) {
+		init_aff_fw_vicinity();
+	} else {
+		long root = of_get_flat_dt_root();
+		if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0"))
+			init_aff_QS20_harcoded();
+	}
+
 	return 0;
 
  out_unregister_sysdev_class:
 	sysdev_class_unregister(&spu_sysdev_class);
  out:
-
 	return ret;
 }
 module_init(init_spu_base);
diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index 261b507a901a..dd2c6688c8aa 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -34,14 +34,27 @@ struct spufs_calls spufs_calls = {
  * this file is not used and the syscalls directly enter the fs code */
 
 asmlinkage long sys_spu_create(const char __user *name,
-		unsigned int flags, mode_t mode)
+		unsigned int flags, mode_t mode, int neighbor_fd)
 {
 	long ret;
 	struct module *owner = spufs_calls.owner;
+	struct file *neighbor;
+	int fput_needed;
 
 	ret = -ENOSYS;
 	if (owner && try_module_get(owner)) {
-		ret = spufs_calls.create_thread(name, flags, mode);
+		if (flags & SPU_CREATE_AFFINITY_SPU) {
+			ret = -EBADF;	/* unless neighbor_fd is an open file */
+			neighbor = fget_light(neighbor_fd, &fput_needed);
+			if (neighbor) {
+				ret = spufs_calls.create_thread(name, flags,
+								mode, neighbor);
+				fput_light(neighbor, fput_needed);
+			}
+		} else {
+			ret = spufs_calls.create_thread(name, flags,
+							mode, NULL);
+		}
 		module_put(owner);
 	}
 	return ret;
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 6d7bd60f5380..6694f86d7000 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/atomic.h>
 #include <asm/spu.h>
@@ -55,12 +56,12 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	ctx->ops = &spu_backing_ops;
 	ctx->owner = get_task_mm(current);
 	INIT_LIST_HEAD(&ctx->rq);
+	INIT_LIST_HEAD(&ctx->aff_list);
 	if (gang)
 		spu_gang_add_ctx(gang, ctx);
 	ctx->cpus_allowed = current->cpus_allowed;
 	spu_set_timeslice(ctx);
-	ctx->stats.execution_state = SPUCTX_UTIL_USER;
-	ctx->stats.tstamp = jiffies;
+	ctx->stats.util_state = SPU_UTIL_IDLE_LOADED;
 
 	atomic_inc(&nr_spu_contexts);
 	goto out;
@@ -81,6 +82,8 @@ void destroy_spu_context(struct kref *kref)
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	BUG_ON(!list_empty(&ctx->rq));
 	atomic_dec(&nr_spu_contexts);
 	kfree(ctx);
@@ -166,6 +169,39 @@ int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags)
 void spu_acquire_saved(struct spu_context *ctx)
 {
 	spu_acquire(ctx);
-	if (ctx->state != SPU_STATE_SAVED)
+	if (ctx->state != SPU_STATE_SAVED) {
+		set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
 		spu_deactivate(ctx);
+	}
+}
+
+/**
+ * spu_release_saved - unlock spu context and return it to the runqueue
+ * @ctx:	context to unlock
+ */
+void spu_release_saved(struct spu_context *ctx)
+{
+	BUG_ON(ctx->state != SPU_STATE_SAVED);
+
+	if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags))
+		spu_activate(ctx, 0);
+
+	spu_release(ctx);
 }
+
+void spu_set_profile_private_kref(struct spu_context *ctx,
+				  struct kref *prof_info_kref,
+				  void ( * prof_info_release) (struct kref *kref))
+{
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private_kref);
+
+void *spu_get_profile_private_kref(struct spu_context *ctx)
+{
+	return ctx->prof_priv_kref;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private_kref);
+
+
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 5d9ad5a0307b..5e31799b1e3f 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -226,7 +226,7 @@ static void spufs_arch_write_notes(struct file *file)
 		spu_acquire_saved(ctx_info->ctx);
 		for (j = 0; j < spufs_coredump_num_notes; j++)
 			spufs_arch_write_note(ctx_info, j, file);
-		spu_release(ctx_info->ctx);
+		spu_release_saved(ctx_info->ctx);
 		list_del(&ctx_info->list);
 		kfree(ctx_info);
 	}
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index f53a07437472..917eab4be486 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -179,16 +179,14 @@ int spufs_handle_class1(struct spu_context *ctx)
 	if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED)))
 		return 0;
 
-	spuctx_switch_state(ctx, SPUCTX_UTIL_IOWAIT);
+	spuctx_switch_state(ctx, SPU_UTIL_IOWAIT);
 
 	pr_debug("ctx %p: ea %016lx, dsisr %016lx state %d\n", ctx, ea,
 		dsisr, ctx->state);
 
 	ctx->stats.hash_flt++;
-	if (ctx->state == SPU_STATE_RUNNABLE) {
+	if (ctx->state == SPU_STATE_RUNNABLE)
 		ctx->spu->stats.hash_flt++;
-		spu_switch_state(ctx->spu, SPU_UTIL_IOWAIT);
-	}
 
 	/* we must not hold the lock when entering spu_handle_mm_fault */
 	spu_release(ctx);
@@ -226,7 +224,7 @@ int spufs_handle_class1(struct spu_context *ctx)
 	} else
 		spufs_handle_dma_error(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE);
 
-	spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM);
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(spufs_handle_class1);
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index c2814ea96af2..7de4e919687b 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -370,7 +370,7 @@ spufs_regs_read(struct file *file, char __user *buffer,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_regs_read(ctx, buffer, size, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -392,7 +392,7 @@ spufs_regs_write(struct file *file, const char __user *buffer,
 	ret = copy_from_user(lscsa->gprs + *pos - size,
 			     buffer, size) ? -EFAULT : size;
 
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -421,7 +421,7 @@ spufs_fpcr_read(struct file *file, char __user * buffer,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_fpcr_read(ctx, buffer, size, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -443,7 +443,7 @@ spufs_fpcr_write(struct file *file, const char __user * buffer,
 	ret = copy_from_user((char *)&lscsa->fpcr + *pos - size,
 			     buffer, size) ? -EFAULT : size;
 
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -868,7 +868,7 @@ static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_signal1_read(ctx, buf, len, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -934,6 +934,13 @@ static const struct file_operations spufs_signal1_fops = {
 	.mmap = spufs_signal1_mmap,
 };
 
+static const struct file_operations spufs_signal1_nosched_fops = {
+	.open = spufs_signal1_open,
+	.release = spufs_signal1_release,
+	.write = spufs_signal1_write,
+	.mmap = spufs_signal1_mmap,
+};
+
 static int spufs_signal2_open(struct inode *inode, struct file *file)
 {
 	struct spufs_inode_info *i = SPUFS_I(inode);
@@ -992,7 +999,7 @@ static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_signal2_read(ctx, buf, len, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1062,6 +1069,13 @@ static const struct file_operations spufs_signal2_fops = {
 	.mmap = spufs_signal2_mmap,
 };
 
+static const struct file_operations spufs_signal2_nosched_fops = {
+	.open = spufs_signal2_open,
+	.release = spufs_signal2_release,
+	.write = spufs_signal2_write,
+	.mmap = spufs_signal2_mmap,
+};
+
 static void spufs_signal1_type_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
@@ -1612,7 +1626,7 @@ static void spufs_decr_set(void *data, u64 val)
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
 	lscsa->decr.slot[0] = (u32) val;
-	spu_release(ctx);
+	spu_release_saved(ctx);
 }
 
 static u64 __spufs_decr_get(void *data)
@@ -1628,7 +1642,7 @@ static u64 spufs_decr_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = __spufs_decr_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
@@ -1637,17 +1651,21 @@ DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
 static void spufs_decr_status_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
-	lscsa->decr_status.slot[0] = (u32) val;
-	spu_release(ctx);
+	if (val)
+		ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
+	else
+		ctx->csa.priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
+	spu_release_saved(ctx);
 }
 
 static u64 __spufs_decr_status_get(void *data)
 {
 	struct spu_context *ctx = data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	return lscsa->decr_status.slot[0];
+	if (ctx->csa.priv2.mfc_control_RW & MFC_CNTL_DECREMENTER_RUNNING)
+		return SPU_DECR_STATUS_RUNNING;
+	else
+		return 0;
 }
 
 static u64 spufs_decr_status_get(void *data)
@@ -1656,7 +1674,7 @@ static u64 spufs_decr_status_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = __spufs_decr_status_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get,
@@ -1668,7 +1686,7 @@ static void spufs_event_mask_set(void *data, u64 val)
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
 	lscsa->event_mask.slot[0] = (u32) val;
-	spu_release(ctx);
+	spu_release_saved(ctx);
 }
 
 static u64 __spufs_event_mask_get(void *data)
@@ -1684,7 +1702,7 @@ static u64 spufs_event_mask_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = __spufs_event_mask_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_event_mask_ops, spufs_event_mask_get,
@@ -1708,7 +1726,7 @@ static u64 spufs_event_status_get(void *data)
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_event_status_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get,
@@ -1720,7 +1738,7 @@ static void spufs_srr0_set(void *data, u64 val)
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
 	lscsa->srr0.slot[0] = (u32) val;
-	spu_release(ctx);
+	spu_release_saved(ctx);
 }
 
 static u64 spufs_srr0_get(void *data)
@@ -1730,7 +1748,7 @@ static u64 spufs_srr0_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = lscsa->srr0.slot[0];
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_srr0_ops, spufs_srr0_get, spufs_srr0_set,
@@ -1786,7 +1804,7 @@ static u64 spufs_lslr_get(void *data)
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_lslr_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1850,7 +1868,7 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_mbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1888,7 +1906,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_ibox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1929,7 +1947,7 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_wbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1979,7 +1997,7 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_dma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -2030,7 +2048,7 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_proxydma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -2065,14 +2083,26 @@ static const char *ctx_state_names[] = {
 };
 
 static unsigned long long spufs_acct_time(struct spu_context *ctx,
-		enum spuctx_execution_state state)
+		enum spu_utilization_state state)
 {
-	unsigned long time = ctx->stats.times[state];
+	struct timespec ts;
+	unsigned long long time = ctx->stats.times[state];
 
-	if (ctx->stats.execution_state == state)
-		time += jiffies - ctx->stats.tstamp;
+	/*
+	 * In general, utilization statistics are updated by the controlling
+	 * thread as the spu context moves through various well defined
+	 * state transitions, but if the context is lazily loaded its
+	 * utilization statistics are not updated as the controlling thread
+	 * is not tightly coupled with the execution of the spu context.  We
+	 * calculate and apply the time delta from the last recorded state
+	 * of the spu context.
+	 */
+	if (ctx->spu && ctx->stats.util_state == state) {
+		ktime_get_ts(&ts);
+		time += timespec_to_ns(&ts) - ctx->stats.tstamp;
+	}
 
-	return jiffies_to_msecs(time);
+	return time / NSEC_PER_MSEC;
 }
 
 static unsigned long long spufs_slb_flts(struct spu_context *ctx)
@@ -2107,11 +2137,11 @@ static int spufs_show_stat(struct seq_file *s, void *private)
 	spu_acquire(ctx);
 	seq_printf(s, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
-		ctx_state_names[ctx->stats.execution_state],
-		spufs_acct_time(ctx, SPUCTX_UTIL_USER),
-		spufs_acct_time(ctx, SPUCTX_UTIL_SYSTEM),
-		spufs_acct_time(ctx, SPUCTX_UTIL_IOWAIT),
-		spufs_acct_time(ctx, SPUCTX_UTIL_LOADED),
+		ctx_state_names[ctx->stats.util_state],
+		spufs_acct_time(ctx, SPU_UTIL_USER),
+		spufs_acct_time(ctx, SPU_UTIL_SYSTEM),
+		spufs_acct_time(ctx, SPU_UTIL_IOWAIT),
+		spufs_acct_time(ctx, SPU_UTIL_IDLE_LOADED),
 		ctx->stats.vol_ctx_switch,
 		ctx->stats.invol_ctx_switch,
 		spufs_slb_flts(ctx),
@@ -2184,8 +2214,8 @@ struct tree_descr spufs_dir_nosched_contents[] = {
 	{ "mbox_stat", &spufs_mbox_stat_fops, 0444, },
 	{ "ibox_stat", &spufs_ibox_stat_fops, 0444, },
 	{ "wbox_stat", &spufs_wbox_stat_fops, 0444, },
-	{ "signal1", &spufs_signal1_fops, 0666, },
-	{ "signal2", &spufs_signal2_fops, 0666, },
+	{ "signal1", &spufs_signal1_nosched_fops, 0222, },
+	{ "signal2", &spufs_signal2_nosched_fops, 0222, },
 	{ "signal1_type", &spufs_signal1_type, 0666, },
 	{ "signal2_type", &spufs_signal2_type, 0666, },
 	{ "mss", &spufs_mss_fops, 0666, },
diff --git a/arch/powerpc/platforms/cell/spufs/gang.c b/arch/powerpc/platforms/cell/spufs/gang.c
index 212ea78f9051..71a443253021 100644
--- a/arch/powerpc/platforms/cell/spufs/gang.c
+++ b/arch/powerpc/platforms/cell/spufs/gang.c
@@ -35,7 +35,9 @@ struct spu_gang *alloc_spu_gang(void)
 
 	kref_init(&gang->kref);
 	mutex_init(&gang->mutex);
+	mutex_init(&gang->aff_mutex);
 	INIT_LIST_HEAD(&gang->list);
+	INIT_LIST_HEAD(&gang->aff_list_head);
 
 out:
 	return gang;
@@ -73,6 +75,10 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx)
 {
 	mutex_lock(&gang->mutex);
 	WARN_ON(ctx->gang != gang);
+	if (!list_empty(&ctx->aff_list)) {
+		list_del_init(&ctx->aff_list);
+		gang->aff_flags &= ~AFF_OFFSETS_SET;
+	}
 	list_del_init(&ctx->gang_list);
 	gang->contexts--;
 	mutex_unlock(&gang->mutex);
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7eb4d6cbcb74..b3d0dd118dd0 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -316,11 +316,107 @@ out:
 	return ret;
 }
 
-static int spufs_create_context(struct inode *inode,
-			struct dentry *dentry,
-			struct vfsmount *mnt, int flags, int mode)
+/*
+ * Check an affinity request for a context that is being created in @gang.
+ * Returns the neighbor with a reference held (NULL if no neighbor was asked
+ * for) or an ERR_PTR(); error returns never hold a neighbor reference.
+ */
+static struct spu_context *
+spufs_assert_affinity(unsigned int flags, struct spu_gang *gang,
+						struct file *filp)
+{
+	struct spu_context *tmp, *neighbor;
+	int count, node, aff_supp;
+
+	aff_supp = !list_empty(&(list_entry(cbe_spu_info[0].spus.next,
+					struct spu, cbe_list))->aff_list);
+	if (!aff_supp || (flags & SPU_CREATE_GANG))
+		return ERR_PTR(-EINVAL);
+
+	if (flags & SPU_CREATE_AFFINITY_MEM && gang->aff_ref_ctx &&
+	    gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM)
+		return ERR_PTR(-EEXIST);
+
+	if (gang->aff_flags & AFF_MERGED)
+		return ERR_PTR(-EBUSY);
+
+	if (!(flags & SPU_CREATE_AFFINITY_SPU))
+		return NULL;
+
+	if (!filp || filp->f_op != &spufs_context_fops)
+		return ERR_PTR(-EINVAL);
+
+	/* No reference yet: @filp is relied on to keep the context alive. */
+	neighbor = SPUFS_I(filp->f_dentry->d_inode)->i_ctx;
+
+	if (!list_empty(&neighbor->aff_list) && !(neighbor->aff_head) &&
+	    !list_is_last(&neighbor->aff_list, &gang->aff_list_head) &&
+	    !list_entry(neighbor->aff_list.next, struct spu_context,
+	    aff_list)->aff_head)
+		return ERR_PTR(-EEXIST);
+
+	if (gang != neighbor->gang)
+		return ERR_PTR(-EINVAL);
+
+	/* The new context, every chained one and, if unchained, the neighbor. */
+	count = 1;
+	list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
+		count++;
+	if (list_empty(&neighbor->aff_list))
+		count++;
+
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		if ((cbe_spu_info[node].n_spus - atomic_read(
+			&cbe_spu_info[node].reserved_spus)) >= count)
+			break;
+	}
+
+	if (node == MAX_NUMNODES)
+		return ERR_PTR(-EEXIST);
+
+	return get_spu_context(neighbor);
+}
+
+/* Set the gang's reference ctx and/or chain @ctx next to @neighbor. */
+static void spufs_set_affinity(unsigned int flags, struct spu_context *ctx,
+					struct spu_context *neighbor)
+{
+	if (flags & SPU_CREATE_AFFINITY_MEM)
+		ctx->gang->aff_ref_ctx = ctx;
+
+	if (!(flags & SPU_CREATE_AFFINITY_SPU))
+		return;
+
+	if (list_empty(&neighbor->aff_list)) {
+		list_add_tail(&neighbor->aff_list, &ctx->gang->aff_list_head);
+		neighbor->aff_head = 1;
+	}
+
+	if (list_is_last(&neighbor->aff_list, &ctx->gang->aff_list_head) ||
+	    list_entry(neighbor->aff_list.next, struct spu_context,
+						aff_list)->aff_head) {
+		list_add(&ctx->aff_list, &neighbor->aff_list);	/* after */
+	} else {
+		list_add_tail(&ctx->aff_list, &neighbor->aff_list); /* before */
+		if (neighbor->aff_head) {
+			neighbor->aff_head = 0;
+			ctx->aff_head = 1;
+		}
+	}
+
+	if (!ctx->gang->aff_ref_ctx)
+		ctx->gang->aff_ref_ctx = ctx;
+}
+
+static int
+spufs_create_context(struct inode *inode, struct dentry *dentry,
+			struct vfsmount *mnt, int flags, int mode,
+			struct file *aff_filp)
 {
 	int ret;
+	int affinity;
+	struct spu_gang *gang;
+	struct spu_context *neighbor;
 
 	ret = -EPERM;
 	if ((flags & SPU_CREATE_NOSCHED) &&
@@ -336,9 +432,29 @@ static int spufs_create_context(struct inode *inode,
 	if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader)
 		goto out_unlock;
 
+	gang = NULL;
+	neighbor = NULL;
+	affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU);
+	if (affinity) {
+		gang = SPUFS_I(inode)->i_gang;
+		ret = -EINVAL;
+		if (!gang)
+			goto out_unlock;
+		mutex_lock(&gang->aff_mutex);
+		neighbor = spufs_assert_affinity(flags, gang, aff_filp);
+		if (IS_ERR(neighbor)) {
+			ret = PTR_ERR(neighbor);
+			goto out_aff_unlock;
+		}
+	}
+
 	ret = spufs_mkdir(inode, dentry, flags, mode & S_IRWXUGO);
 	if (ret)
-		goto out_unlock;
+		goto out_aff_unlock;
+
+	if (affinity)
+		spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx,
+								neighbor);
 
 	/*
 	 * get references for dget and mntget, will be released
@@ -352,6 +468,9 @@ static int spufs_create_context(struct inode *inode,
 		goto out;
 	}
 
+out_aff_unlock:
+	if (affinity)
+		mutex_unlock(&gang->aff_mutex);
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
@@ -450,7 +569,8 @@ out:
 
 static struct file_system_type spufs_type;
 
-long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
+long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode,
+							struct file *filp)
 {
 	struct dentry *dentry;
 	int ret;
@@ -487,7 +607,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
 					dentry, nd->mnt, mode);
 	else
 		return spufs_create_context(nd->dentry->d_inode,
-					dentry, nd->mnt, flags, mode);
+					dentry, nd->mnt, flags, mode, filp);
 
 out_dput:
 	dput(dentry);
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 58ae13b7de84..0b50fa5cb39d 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -18,15 +18,17 @@ void spufs_stop_callback(struct spu *spu)
 	wake_up_all(&ctx->stop_wq);
 }
 
-static inline int spu_stopped(struct spu_context *ctx, u32 * stat)
+static inline int spu_stopped(struct spu_context *ctx, u32 *stat)
 {
 	struct spu *spu;
 	u64 pte_fault;
 
 	*stat = ctx->ops->status_read(ctx);
-	if (ctx->state != SPU_STATE_RUNNABLE)
-		return 1;
+
 	spu = ctx->spu;
+	if (ctx->state != SPU_STATE_RUNNABLE ||
+	    test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))
+		return 1;
 	pte_fault = spu->dsisr &
 	    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
 	return (!(*stat & SPU_STATUS_RUNNING) || pte_fault || spu->class_0_pending) ?
@@ -124,8 +126,10 @@ out:
 	return ret;
 }
 
-static int spu_run_init(struct spu_context *ctx, u32 * npc)
+static int spu_run_init(struct spu_context *ctx, u32 *npc)
 {
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
 	if (ctx->flags & SPU_CREATE_ISOLATE) {
 		unsigned long runcntl;
 
@@ -151,16 +155,20 @@ static int spu_run_init(struct spu_context *ctx, u32 * npc)
 		ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
 	}
 
+	spuctx_switch_state(ctx, SPU_UTIL_USER);
+
 	return 0;
 }
 
-static int spu_run_fini(struct spu_context *ctx, u32 * npc,
-			       u32 * status)
+static int spu_run_fini(struct spu_context *ctx, u32 *npc,
+			       u32 *status)
 {
 	int ret = 0;
 
 	*status = ctx->ops->status_read(ctx);
 	*npc = ctx->ops->npc_read(ctx);
+
+	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
 	spu_release(ctx);
 
 	if (signal_pending(current))
@@ -289,10 +297,10 @@ static inline int spu_process_events(struct spu_context *ctx)
 	return ret;
 }
 
-long spufs_run_spu(struct file *file, struct spu_context *ctx,
-		   u32 *npc, u32 *event)
+long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 {
 	int ret;
+	struct spu *spu;
 	u32 status;
 
 	if (mutex_lock_interruptible(&ctx->run_mutex))
@@ -328,6 +336,17 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
 		ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, &status));
 		if (unlikely(ret))
 			break;
+		spu = ctx->spu;
+		if (unlikely(test_and_clear_bit(SPU_SCHED_NOTIFY_ACTIVE,
+						&ctx->sched_flags))) {
+			if (!(status & SPU_STATUS_STOPPED_BY_STOP)) {
+				spu_switch_notify(spu, ctx);
+				continue;
+			}
+		}
+
+		spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
 		if ((status & SPU_STATUS_STOPPED_BY_STOP) &&
 		    (status >> SPU_STOP_STATUS_SHIFT == 0x2104)) {
 			ret = spu_process_callback(ctx);
@@ -356,6 +375,7 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
 	    (ctx->state == SPU_STATE_RUNNABLE))
 		ctx->stats.libassist++;
 
+
 	ctx->ops->master_stop(ctx);
 	ret = spu_run_fini(ctx, npc, &status);
 	spu_yield(ctx);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e5b4dd1db286..227968b4779d 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -51,9 +51,6 @@ struct spu_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_PRIO);
 	struct list_head runq[MAX_PRIO];
 	spinlock_t runq_lock;
-	struct list_head active_list[MAX_NUMNODES];
-	struct mutex active_mutex[MAX_NUMNODES];
-	int nr_active[MAX_NUMNODES];
 	int nr_waiting;
 };
 
@@ -127,7 +124,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	ctx->policy = current->policy;
 
 	/*
-	 * A lot of places that don't hold active_mutex poke into
+	 * A lot of places that don't hold list_mutex poke into
 	 * cpus_allowed, including grab_runnable_context which
 	 * already holds the runq_lock.  So abuse runq_lock
 	 * to protect this field aswell.
@@ -141,9 +138,9 @@ void spu_update_sched_info(struct spu_context *ctx)
 {
 	int node = ctx->spu->node;
 
-	mutex_lock(&spu_prio->active_mutex[node]);
+	mutex_lock(&cbe_spu_info[node].list_mutex);
 	__spu_update_sched_info(ctx);
-	mutex_unlock(&spu_prio->active_mutex[node]);
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
 }
 
 static int __node_allowed(struct spu_context *ctx, int node)
@@ -169,56 +166,56 @@ static int node_allowed(struct spu_context *ctx, int node)
 	return rval;
 }
 
-/**
- * spu_add_to_active_list - add spu to active list
- * @spu:	spu to add to the active list
- */
-static void spu_add_to_active_list(struct spu *spu)
-{
-	int node = spu->node;
-
-	mutex_lock(&spu_prio->active_mutex[node]);
-	spu_prio->nr_active[node]++;
-	list_add_tail(&spu->list, &spu_prio->active_list[node]);
-	mutex_unlock(&spu_prio->active_mutex[node]);
-}
+static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);
 
-static void __spu_remove_from_active_list(struct spu *spu)
+void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
 {
-	list_del_init(&spu->list);
-	spu_prio->nr_active[spu->node]--;
+	blocking_notifier_call_chain(&spu_switch_notifier,
+			    ctx ? ctx->object_id : 0, spu);
 }
 
-/**
- * spu_remove_from_active_list - remove spu from active list
- * @spu:       spu to remove from the active list
- */
-static void spu_remove_from_active_list(struct spu *spu)
+static void notify_spus_active(void)
 {
-	int node = spu->node;
-
-	mutex_lock(&spu_prio->active_mutex[node]);
-	__spu_remove_from_active_list(spu);
-	mutex_unlock(&spu_prio->active_mutex[node]);
-}
+	int node;
 
-static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);
+	/*
+	 * Wake up the active spu_contexts.
+	 *
+	 * When the awakened processes see their "notify_active" flag is set,
+	 * they will call spu_switch_notify();
+	 */
+	for_each_online_node(node) {
+		struct spu *spu;
 
-static void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
-{
-	blocking_notifier_call_chain(&spu_switch_notifier,
-			    ctx ? ctx->object_id : 0, spu);
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if (spu->alloc_state != SPU_FREE) {
+				struct spu_context *ctx = spu->ctx;
+				set_bit(SPU_SCHED_NOTIFY_ACTIVE,
+					&ctx->sched_flags);
+				mb();
+				wake_up_all(&ctx->stop_wq);
+			}
+		}
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
+	}
 }
 
 int spu_switch_event_register(struct notifier_block * n)
 {
-	return blocking_notifier_chain_register(&spu_switch_notifier, n);
+	int ret;
+	ret = blocking_notifier_chain_register(&spu_switch_notifier, n);
+	if (!ret)
+		notify_spus_active();
+	return ret;
 }
+EXPORT_SYMBOL_GPL(spu_switch_event_register);
 
 int spu_switch_event_unregister(struct notifier_block * n)
 {
 	return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
 }
+EXPORT_SYMBOL_GPL(spu_switch_event_unregister);
 
 /**
  * spu_bind_context - bind spu context to physical spu
@@ -229,6 +226,12 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 {
 	pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
 		 spu->number, spu->node);
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
+	if (ctx->flags & SPU_CREATE_NOSCHED)
+		atomic_inc(&cbe_spu_info[spu->node].reserved_spus);
+	if (!list_empty(&ctx->aff_list))
+		atomic_inc(&ctx->gang->aff_sched_count);
 
 	ctx->stats.slb_flt_base = spu->stats.slb_flt;
 	ctx->stats.class2_intr_base = spu->stats.class2_intr;
@@ -238,6 +241,7 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu_associate_mm(spu, ctx->owner);
 	spu->ibox_callback = spufs_ibox_callback;
 	spu->wbox_callback = spufs_wbox_callback;
@@ -251,7 +255,153 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	spu_cpu_affinity_set(spu, raw_smp_processor_id());
 	spu_switch_notify(spu, ctx);
 	ctx->state = SPU_STATE_RUNNABLE;
-	spu_switch_state(spu, SPU_UTIL_SYSTEM);
+
+	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
+}
+
+/*
+ * Non-zero unless @spu holds a NOSCHED context.  Needs list_mutex held.
+ */
+static inline int sched_spu(struct spu *spu)
+{
+	BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex));
+
+	return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
+}
+
+/* Chain every gang member that is not yet on the gang's affinity list. */
+static void aff_merge_remaining_ctxs(struct spu_gang *gang)
+{
+	struct spu_context *ctx;
+
+	list_for_each_entry(ctx, &gang->list, gang_list)
+		if (list_empty(&ctx->aff_list))
+			list_add(&ctx->aff_list, &gang->aff_list_head);
+	gang->aff_flags |= AFF_MERGED;
+}
+
+/* Give each chained ctx its signed distance from the gang's reference ctx. */
+static void aff_set_offsets(struct spu_gang *gang)
+{
+	struct spu_context *pos;
+	int before = -1, after = 0;
+
+	list_for_each_entry_reverse(pos, &gang->aff_ref_ctx->aff_list,
+								aff_list) {
+		if (&pos->aff_list == &gang->aff_list_head)
+			break;
+		pos->aff_offset = before--;
+	}
+
+	/* Using ref.prev as the head makes this walk start at the reference. */
+	list_for_each_entry(pos, gang->aff_ref_ctx->aff_list.prev, aff_list) {
+		if (&pos->aff_list == &gang->aff_list_head)
+			break;
+		pos->aff_offset = after++;
+	}
+
+	gang->aff_flags |= AFF_OFFSETS_SET;
+}
+
+static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
+		 int group_size, int lowest_offset)
+{
+	struct spu *spu;
+	int node, n;
+
+	/*
+	 * TODO: A better algorithm could be used to find a good spu to be used
+	 *       as reference location; group_size/lowest_offset are unused.
+	 */
+	node = cpu_to_node(raw_smp_processor_id());
+	for (n = 0; n < MAX_NUMNODES; n++, node++) {
+		node = (node < MAX_NUMNODES) ? node : 0;
+		if (!node_allowed(ctx, node))
+			continue;
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if ((!mem_aff || spu->has_mem_affinity) &&
+							sched_spu(spu)) {
+				mutex_unlock(&cbe_spu_info[node].list_mutex);
+				return spu;
+			}
+		}
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
+	}
+	return NULL;
+}
+
+/* Select gang->aff_ref_spu, the spu that anchors the gang's affinity chain. */
+static void aff_set_ref_point_location(struct spu_gang *gang)
+{
+	int mem_aff, gs = 0, lowest_offset = 0;
+	struct spu_context *tmp, *ctx;
+
+	mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM;
+
+	list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
+		gs++;
+
+	list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
+								aff_list) {
+		if (&ctx->aff_list == &gang->aff_list_head)
+			break;
+		lowest_offset = ctx->aff_offset;
+	}
+
+	/* ctx is only a loop cursor here; search for the reference context. */
+	gang->aff_ref_spu = aff_ref_location(gang->aff_ref_ctx, mem_aff, gs,
+							lowest_offset);
+}
+
+/* Step @offset schedulable spus from @ref along the spu affinity list. */
+static struct spu *ctx_location(struct spu *ref, int offset, int node)
+{
+	struct spu *pos = NULL;
+
+	if (offset < 0) {
+		list_for_each_entry_reverse(pos, ref->aff_list.next, aff_list) {
+			BUG_ON(pos->node != node);
+			if (offset == 0)
+				break;
+			if (sched_spu(pos))
+				offset++;
+		}
+		return pos;
+	}
+
+	list_for_each_entry(pos, ref->aff_list.prev, aff_list) {
+		BUG_ON(pos->node != node);
+		if (offset == 0)
+			break;
+		if (sched_spu(pos))
+			offset--;
+	}
+	return pos;
+}
+
+/*
+ * has_affinity is called each time a context is going to be scheduled.
+ * It sets up the gang's reference spu if needed; non-zero means one exists.
+ */
+static int has_affinity(struct spu_context *ctx)
+{
+	struct spu_gang *gang = ctx->gang;
+
+	if (list_empty(&ctx->aff_list))
+		return 0;
+
+	mutex_lock(&gang->aff_mutex);
+	if (!gang->aff_ref_spu) {
+		if (!(gang->aff_flags & AFF_MERGED))
+			aff_merge_remaining_ctxs(gang);
+		if (!(gang->aff_flags & AFF_OFFSETS_SET))
+			aff_set_offsets(gang);
+		aff_set_ref_point_location(gang);
+	}
+	mutex_unlock(&gang->aff_mutex);
+
+	return gang->aff_ref_spu != NULL;
+}
 
 /**
@@ -263,9 +413,13 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 {
 	pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
 		 spu->pid, spu->number, spu->node);
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
-	spu_switch_state(spu, SPU_UTIL_IDLE);
-
+ 	if (spu->ctx->flags & SPU_CREATE_NOSCHED)
+		atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
+ 	if (!list_empty(&ctx->aff_list))
+ 		if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
+ 			ctx->gang->aff_ref_spu = NULL;
 	spu_switch_notify(spu, NULL);
 	spu_unmap_mappings(ctx);
 	spu_save(&ctx->csa, spu);
@@ -278,8 +432,8 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 	spu->dma_callback = NULL;
 	spu_associate_mm(spu, NULL);
 	spu->pid = 0;
+	spu->tgid = 0;
 	ctx->ops = &spu_backing_ops;
-	ctx->spu = NULL;
 	spu->flags = 0;
 	spu->ctx = NULL;
 
@@ -287,6 +441,10 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 		(spu->stats.slb_flt - ctx->stats.slb_flt_base);
 	ctx->stats.class2_intr +=
 		(spu->stats.class2_intr - ctx->stats.class2_intr_base);
+
+	/* This maps the underlying spu state to idle */
+	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
+	ctx->spu = NULL;
 }
 
 /**
@@ -352,18 +510,41 @@ static void spu_prio_wait(struct spu_context *ctx)
 
 static struct spu *spu_get_idle(struct spu_context *ctx)
 {
-	struct spu *spu = NULL;
-	int node = cpu_to_node(raw_smp_processor_id());
-	int n;
+	struct spu *spu;
+	int node, n;
+
+	if (has_affinity(ctx)) {
+		node = ctx->gang->aff_ref_spu->node;
 
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		spu = ctx_location(ctx->gang->aff_ref_spu, ctx->aff_offset, node);
+		if (spu && spu->alloc_state == SPU_FREE)
+			goto found;
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
+		return NULL;
+	}
+
+	node = cpu_to_node(raw_smp_processor_id());
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
 		node = (node < MAX_NUMNODES) ? node : 0;
 		if (!node_allowed(ctx, node))
 			continue;
-		spu = spu_alloc_node(node);
-		if (spu)
-			break;
+
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if (spu->alloc_state == SPU_FREE)
+				goto found;
+		}
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
+
+	return NULL;
+
+ found:
+	spu->alloc_state = SPU_USED;
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
+	pr_debug("Got SPU %d %d\n", spu->number, spu->node);
+	spu_init_channels(spu);
 	return spu;
 }
 
@@ -393,15 +574,15 @@ static struct spu *find_victim(struct spu_context *ctx)
 		if (!node_allowed(ctx, node))
 			continue;
 
-		mutex_lock(&spu_prio->active_mutex[node]);
-		list_for_each_entry(spu, &spu_prio->active_list[node], list) {
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			struct spu_context *tmp = spu->ctx;
 
 			if (tmp->prio > ctx->prio &&
 			    (!victim || tmp->prio > victim->prio))
 				victim = spu->ctx;
 		}
-		mutex_unlock(&spu_prio->active_mutex[node]);
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 
 		if (victim) {
 			/*
@@ -426,7 +607,11 @@ static struct spu *find_victim(struct spu_context *ctx)
 				victim = NULL;
 				goto restart;
 			}
-			spu_remove_from_active_list(spu);
+
+			mutex_lock(&cbe_spu_info[node].list_mutex);
+			cbe_spu_info[node].nr_active--;
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
+
 			spu_unbind_context(spu, victim);
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
@@ -455,8 +640,6 @@ static struct spu *find_victim(struct spu_context *ctx)
  */
 int spu_activate(struct spu_context *ctx, unsigned long flags)
 {
-	spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM);
-
 	do {
 		struct spu *spu;
 
@@ -477,8 +660,12 @@ int spu_activate(struct spu_context *ctx, unsigned long flags)
 		if (!spu && rt_prio(ctx->prio))
 			spu = find_victim(ctx);
 		if (spu) {
+			int node = spu->node;
+
+			mutex_lock(&cbe_spu_info[node].list_mutex);
 			spu_bind_context(spu, ctx);
-			spu_add_to_active_list(spu);
+			cbe_spu_info[node].nr_active++;
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
 			return 0;
 		}
 
@@ -500,7 +687,7 @@ static struct spu_context *grab_runnable_context(int prio, int node)
 	int best;
 
 	spin_lock(&spu_prio->runq_lock);
-	best = sched_find_first_bit(spu_prio->bitmap);
+	best = find_first_bit(spu_prio->bitmap, prio);
 	while (best < prio) {
 		struct list_head *rq = &spu_prio->runq[best];
 
@@ -527,11 +714,17 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
 	if (spu) {
 		new = grab_runnable_context(max_prio, spu->node);
 		if (new || force) {
-			spu_remove_from_active_list(spu);
+			int node = spu->node;
+
+			mutex_lock(&cbe_spu_info[node].list_mutex);
 			spu_unbind_context(spu, ctx);
+			spu->alloc_state = SPU_FREE;
+			cbe_spu_info[node].nr_active--;
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
+
 			ctx->stats.vol_ctx_switch++;
 			spu->stats.vol_ctx_switch++;
-			spu_free(spu);
+
 			if (new)
 				wake_up(&new->stop_wq);
 		}
@@ -550,21 +743,11 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
  */
 void spu_deactivate(struct spu_context *ctx)
 {
-	/*
-	 * We must never reach this for a nosched context,
-	 * but handle the case gracefull instead of panicing.
-	 */
-	if (ctx->flags & SPU_CREATE_NOSCHED) {
-		WARN_ON(1);
-		return;
-	}
-
 	__spu_deactivate(ctx, 1, MAX_PRIO);
-	spuctx_switch_state(ctx, SPUCTX_UTIL_USER);
 }
 
 /**
- * spu_yield -  yield a physical spu if others are waiting
+ * spu_yield -	yield a physical spu if others are waiting
  * @ctx:	spu context to yield
  *
  * Check if there is a higher priority context waiting and if yes
@@ -575,17 +758,12 @@ void spu_yield(struct spu_context *ctx)
 {
 	if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
 		mutex_lock(&ctx->state_mutex);
-		if (__spu_deactivate(ctx, 0, MAX_PRIO))
-			spuctx_switch_state(ctx, SPUCTX_UTIL_USER);
-		else {
-			spuctx_switch_state(ctx, SPUCTX_UTIL_LOADED);
-			spu_switch_state(ctx->spu, SPU_UTIL_USER);
-		}
+		__spu_deactivate(ctx, 0, MAX_PRIO);
 		mutex_unlock(&ctx->state_mutex);
 	}
 }
 
-static void spusched_tick(struct spu_context *ctx)
+static noinline void spusched_tick(struct spu_context *ctx)
 {
 	if (ctx->flags & SPU_CREATE_NOSCHED)
 		return;
@@ -596,7 +774,7 @@ static void spusched_tick(struct spu_context *ctx)
 		return;
 
 	/*
-	 * Unfortunately active_mutex ranks outside of state_mutex, so
+	 * Unfortunately list_mutex ranks outside of state_mutex, so
 	 * we have to trylock here.  If we fail give the context another
 	 * tick and try again.
 	 */
@@ -606,12 +784,11 @@ static void spusched_tick(struct spu_context *ctx)
 
 		new = grab_runnable_context(ctx->prio + 1, spu->node);
 		if (new) {
-
-			__spu_remove_from_active_list(spu);
 			spu_unbind_context(spu, ctx);
 			ctx->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
-			spu_free(spu);
+			spu->alloc_state = SPU_FREE;
+			cbe_spu_info[spu->node].nr_active--;
 			wake_up(&new->stop_wq);
 			/*
 			 * We need to break out of the wait loop in
@@ -632,7 +809,7 @@ static void spusched_tick(struct spu_context *ctx)
  *
  * Return the number of tasks currently running or waiting to run.
  *
- * Note that we don't take runq_lock / active_mutex here.  Reading
+ * Note that we don't take runq_lock / list_mutex here.  Reading
  * a single 32bit value is atomic on powerpc, and we don't care
  * about memory ordering issues here.
  */
@@ -641,7 +818,7 @@ static unsigned long count_active_contexts(void)
 	int nr_active = 0, node;
 
 	for (node = 0; node < MAX_NUMNODES; node++)
-		nr_active += spu_prio->nr_active[node];
+		nr_active += cbe_spu_info[node].nr_active;
 	nr_active += spu_prio->nr_waiting;
 
 	return nr_active;
@@ -681,19 +858,18 @@ static void spusched_wake(unsigned long data)
 
 static int spusched_thread(void *unused)
 {
-	struct spu *spu, *next;
+	struct spu *spu;
 	int node;
 
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
 		for (node = 0; node < MAX_NUMNODES; node++) {
-			mutex_lock(&spu_prio->active_mutex[node]);
-			list_for_each_entry_safe(spu, next,
-						 &spu_prio->active_list[node],
-						 list)
-				spusched_tick(spu->ctx);
-			mutex_unlock(&spu_prio->active_mutex[node]);
+			mutex_lock(&cbe_spu_info[node].list_mutex);
+			list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
+				if (spu->ctx)
+					spusched_tick(spu->ctx);
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
 		}
 	}
 
@@ -751,10 +927,9 @@ int __init spu_sched_init(void)
 		INIT_LIST_HEAD(&spu_prio->runq[i]);
 		__clear_bit(i, spu_prio->bitmap);
 	}
-	__set_bit(MAX_PRIO, spu_prio->bitmap);
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		mutex_init(&spu_prio->active_mutex[i]);
-		INIT_LIST_HEAD(&spu_prio->active_list[i]);
+		mutex_init(&cbe_spu_info[i].list_mutex);
+		INIT_LIST_HEAD(&cbe_spu_info[i].spus);
 	}
 	spin_lock_init(&spu_prio->runq_lock);
 
@@ -783,9 +958,9 @@ int __init spu_sched_init(void)
 	return err;
 }
 
-void __exit spu_sched_exit(void)
+void spu_sched_exit(void)
 {
-	struct spu *spu, *tmp;
+	struct spu *spu;
 	int node;
 
 	remove_proc_entry("spu_loadavg", NULL);
@@ -794,13 +969,11 @@ void __exit spu_sched_exit(void)
 	kthread_stop(spusched_task);
 
 	for (node = 0; node < MAX_NUMNODES; node++) {
-		mutex_lock(&spu_prio->active_mutex[node]);
-		list_for_each_entry_safe(spu, tmp, &spu_prio->active_list[node],
-					 list) {
-			list_del_init(&spu->list);
-			spu_free(spu);
-		}
-		mutex_unlock(&spu_prio->active_mutex[node]);
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
+			if (spu->alloc_state != SPU_FREE)
+				spu->alloc_state = SPU_FREE;
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
 	kfree(spu_prio);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore.c b/arch/powerpc/platforms/cell/spufs/spu_restore.c
index 4e19ed7a0756..21a9c952d88b 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore.c
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore.c
@@ -84,13 +84,13 @@ static inline void restore_decr(void)
 	unsigned int decr_running;
 	unsigned int decr;
 
-	/* Restore, Step 6:
+	/* Restore, Step 6 (moved; now runs after Step 9):
 	 *    If the LSCSA "decrementer running" flag is set
 	 *    then write the SPU_WrDec channel with the
 	 *    decrementer value from LSCSA.
 	 */
 	offset = LSCSA_QW_OFFSET(decr_status);
-	decr_running = regs_spill[offset].slot[0];
+	decr_running = regs_spill[offset].slot[0] & SPU_DECR_STATUS_RUNNING;
 	if (decr_running) {
 		offset = LSCSA_QW_OFFSET(decr);
 		decr = regs_spill[offset].slot[0];
@@ -318,10 +318,10 @@ int main()
 	build_dma_list(lscsa_ea);	/* Step 3.  */
 	restore_upper_240kb(lscsa_ea);	/* Step 4.  */
 					/* Step 5: done by 'exit'. */
-	restore_decr();			/* Step 6. */
 	enqueue_putllc(lscsa_ea);	/* Step 7. */
 	set_tag_update();		/* Step 8. */
 	read_tag_status();		/* Step 9. */
+	restore_decr();			/* moved Step 6. */
 	read_llar_status();		/* Step 10. */
 	write_ppu_mb();			/* Step 11. */
 	write_ppuint_mb();		/* Step 12. */
diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
index 15183d209b58..f383b027e8bf 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
@@ -10,7 +10,7 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x24fd8081,
 0x1cd80081,
 0x33001180,
-0x42030003,
+0x42034003,
 0x33800284,
 0x1c010204,
 0x40200000,
@@ -24,22 +24,22 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x23fffd84,
 0x1c100183,
 0x217ffa85,
-0x3080a000,
-0x3080a201,
-0x3080a402,
-0x3080a603,
-0x3080a804,
-0x3080aa05,
-0x3080ac06,
-0x3080ae07,
-0x3080b008,
-0x3080b209,
-0x3080b40a,
-0x3080b60b,
-0x3080b80c,
-0x3080ba0d,
-0x3080bc0e,
-0x3080be0f,
+0x3080b000,
+0x3080b201,
+0x3080b402,
+0x3080b603,
+0x3080b804,
+0x3080ba05,
+0x3080bc06,
+0x3080be07,
+0x3080c008,
+0x3080c209,
+0x3080c40a,
+0x3080c60b,
+0x3080c80c,
+0x3080ca0d,
+0x3080cc0e,
+0x3080ce0f,
 0x00003ffc,
 0x00000000,
 0x00000000,
@@ -48,19 +48,18 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x3ec00083,
 0xb0a14103,
 0x01a00204,
-0x3ec10082,
-0x4202800e,
-0x04000703,
-0xb0a14202,
-0x21a00803,
-0x3fbf028d,
-0x3f20068d,
-0x3fbe0682,
+0x3ec10083,
+0x4202c002,
+0xb0a14203,
+0x21a00802,
+0x3fbf028a,
+0x3f20050a,
+0x3fbe0502,
 0x3fe30102,
 0x21a00882,
-0x3f82028f,
-0x3fe3078f,
-0x3fbf0784,
+0x3f82028b,
+0x3fe3058b,
+0x3fbf0584,
 0x3f200204,
 0x3fbe0204,
 0x3fe30204,
@@ -75,252 +74,285 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x21a00083,
 0x40800082,
 0x21a00b02,
-0x10002818,
-0x42a00002,
-0x32800007,
-0x4207000c,
-0x18008208,
-0x40a0000b,
-0x4080020a,
-0x40800709,
-0x00200000,
-0x42070002,
-0x3ac30384,
+0x10002612,
+0x42a00003,
+0x42074006,
+0x1800c204,
+0x40a00008,
+0x40800789,
+0x1c010305,
+0x34000302,
 0x1cffc489,
-0x00200000,
-0x18008383,
-0x38830382,
-0x4cffc486,
-0x3ac28185,
-0xb0408584,
-0x28830382,
-0x1c020387,
-0x38828182,
-0xb0408405,
-0x1802c408,
-0x28828182,
-0x217ff886,
-0x04000583,
-0x21a00803,
-0x3fbe0682,
-0x3fe30102,
-0x04000106,
-0x21a00886,
-0x04000603,
-0x21a00903,
-0x40803c02,
-0x21a00982,
-0x40800003,
-0x04000184,
-0x21a00a04,
+0x3ec00303,
+0x3ec00287,
+0xb0408403,
+0x24000302,
+0x34000282,
+0x1c020306,
+0xb0408207,
+0x18020204,
+0x24000282,
+0x217ffa09,
+0x04000402,
+0x21a00802,
+0x3fbe0504,
+0x3fe30204,
+0x21a00884,
+0x42074002,
+0x21a00902,
+0x40803c03,
+0x21a00983,
+0x04000485,
+0x21a00a05,
 0x40802202,
 0x21a00a82,
-0x42028005,
-0x34208702,
-0x21002282,
-0x21a00804,
-0x21a00886,
-0x3fbf0782,
+0x21a00805,
+0x21a00884,
+0x3fbf0582,
 0x3f200102,
 0x3fbe0102,
 0x3fe30102,
 0x21a00902,
 0x40804003,
 0x21a00983,
-0x21a00a04,
+0x21a00a05,
 0x40805a02,
 0x21a00a82,
 0x40800083,
 0x21a00b83,
 0x01a00c02,
-0x01a00d83,
-0x3420c282,
+0x30809c03,
+0x34000182,
+0x14004102,
+0x21002082,
+0x01a00d82,
+0x3080a003,
+0x34000182,
 0x21a00e02,
-0x34210283,
-0x21a00f03,
-0x34200284,
-0x77400200,
-0x3421c282,
+0x3080a203,
+0x34000182,
+0x21a00f02,
+0x3080a403,
+0x34000182,
+0x77400100,
+0x3080a603,
+0x34000182,
 0x21a00702,
-0x34218283,
-0x21a00083,
-0x34214282,
+0x3080a803,
+0x34000182,
+0x21a00082,
+0x3080aa03,
+0x34000182,
 0x21a00b02,
-0x4200480c,
-0x00200000,
-0x1c010286,
-0x34220284,
-0x34220302,
-0x0f608203,
-0x5c024204,
-0x3b81810b,
-0x42013c02,
-0x00200000,
-0x18008185,
-0x38808183,
-0x3b814182,
-0x21004e84,
+0x4020007f,
+0x3080ae02,
+0x42004805,
+0x3080ac04,
+0x34000103,
+0x34000202,
+0x1cffc183,
+0x3b810106,
+0x0f608184,
+0x42013802,
+0x5c020183,
+0x38810102,
+0x3b810102,
+0x21000e83,
 0x4020007f,
 0x35000100,
-0x000004e0,
-0x000002a0,
-0x000002e8,
-0x00000428,
+0x00000470,
+0x000002f8,
+0x00000430,
 0x00000360,
-0x000002e8,
-0x000004a0,
-0x00000468,
+0x000002f8,
 0x000003c8,
+0x000004a8,
+0x00000298,
 0x00000360,
+0x00200000,
 0x409ffe02,
 0x30801203,
-0x40800204,
-0x3ec40085,
-0x10009c09,
-0x3ac10606,
-0xb060c105,
-0x4020007f,
-0x4020007f,
+0x40800208,
+0x3ec40084,
+0x40800407,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
-0x38810602,
-0xb0408586,
-0x28810602,
-0x32004180,
-0x34204702,
+0x38820282,
+0x41004003,
+0xb0408189,
+0x28820282,
+0x3881c282,
+0xb0408304,
+0x2881c282,
+0x00400000,
+0x40800003,
+0x35000000,
+0x30809e03,
+0x34000182,
 0x21a00382,
 0x4020007f,
-0x327fdc80,
+0x327fde00,
 0x409ffe02,
 0x30801203,
-0x40800204,
-0x3ec40087,
-0x40800405,
-0x00200000,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800206,
+0x3ec40084,
+0x40800407,
+0x40800608,
+0x3ac1828a,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
+0x38818282,
 0x41004003,
-0x38810602,
-0x4020007f,
-0xb0408188,
-0x4020007f,
-0x28810602,
-0x41201002,
-0x38814603,
-0x10009c09,
-0xb060c109,
-0x4020007f,
-0x28814603,
+0xb040818a,
+0x10005b0b,
+0x41201003,
+0x28818282,
+0x3881c282,
+0xb0408184,
 0x41193f83,
-0x38818602,
 0x60ffc003,
-0xb040818a,
-0x28818602,
-0x32003080,
+0x2881c282,
+0x38820282,
+0xb0408189,
+0x28820282,
+0x327fef80,
 0x409ffe02,
 0x30801203,
-0x40800204,
-0x3ec40087,
-0x41201008,
-0x10009c14,
-0x40800405,
-0x3ac10609,
-0x40800606,
-0x3ac1460a,
-0xb060c107,
-0x3ac1860b,
+0x40800207,
+0x3ec40086,
+0x4120100b,
+0x10005b14,
+0x40800404,
+0x3ac1c289,
+0x40800608,
+0xb060c106,
+0x3ac10286,
+0x3ac2028a,
 0x20801203,
-0x38810602,
-0xb0408409,
-0x28810602,
-0x38814603,
-0xb060c40a,
-0x4020007f,
-0x28814603,
+0x3881c282,
 0x41193f83,
-0x38818602,
 0x60ffc003,
-0xb040818b,
-0x28818602,
-0x32002380,
-0x409ffe02,
-0x30801204,
-0x40800205,
-0x3ec40083,
-0x40800406,
-0x3ac14607,
-0x3ac18608,
-0xb0810103,
-0x41004002,
-0x20801204,
-0x4020007f,
-0x38814603,
-0x10009c0b,
-0xb060c107,
-0x4020007f,
-0x4020007f,
-0x28814603,
-0x38818602,
-0x4020007f,
+0xb0408589,
+0x2881c282,
+0x38810282,
+0xb0408586,
+0x28810282,
+0x38820282,
+0xb040818a,
+0x28820282,
 0x4020007f,
-0xb0408588,
-0x28818602,
+0x327fe280,
+0x409ffe02,
+0x30801203,
+0x40800207,
+0x3ec40084,
+0x40800408,
+0x10005b14,
+0x40800609,
+0x3ac1c28a,
+0x3ac2028b,
+0xb060c104,
+0x3ac24284,
+0x20801203,
+0x41201003,
+0x3881c282,
+0xb040830a,
+0x2881c282,
+0x38820282,
+0xb040818b,
+0x41193f83,
+0x60ffc003,
+0x28820282,
+0x38824282,
+0xb0408184,
+0x28824282,
 0x4020007f,
-0x32001780,
+0x327fd580,
 0x409ffe02,
-0x1000640e,
-0x40800204,
+0x1000658e,
+0x40800206,
 0x30801203,
-0x40800405,
-0x3ec40087,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800407,
+0x3ec40084,
+0x40800608,
+0x3ac1828a,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
 0x413d8003,
-0x38810602,
+0x38818282,
 0x4020007f,
-0x327fd780,
-0x409ffe02,
-0x10007f0c,
-0x40800205,
-0x30801204,
-0x40800406,
-0x3ec40083,
-0x3ac14607,
-0x3ac18608,
-0xb0810103,
-0x413d8002,
-0x20801204,
-0x38814603,
+0x327fd800,
+0x409ffe03,
+0x30801202,
+0x40800207,
+0x3ec40084,
+0x10005b09,
+0x3ac1c288,
+0xb0408184,
 0x4020007f,
-0x327feb80,
+0x4020007f,
+0x20801202,
+0x3881c282,
+0xb0408308,
+0x2881c282,
+0x327fc680,
 0x409ffe02,
+0x1000588b,
+0x40800208,
 0x30801203,
-0x40800204,
-0x3ec40087,
-0x40800405,
-0x1000650a,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800407,
+0x3ec40084,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
-0x38810602,
-0xb0408588,
-0x4020007f,
-0x327fc980,
-0x00400000,
-0x40800003,
-0x4020007f,
-0x35000000,
+0x413d8003,
+0x38820282,
+0x327fbd80,
+0x00200000,
+0x00000da0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000d90,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000db0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000dc0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000d80,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000df0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000de0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000dd0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000e04,
+0x00000000,
+0x00000000,
 0x00000000,
+0x00000e00,
 0x00000000,
 0x00000000,
 0x00000000,
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 08b3530288ac..8b20c0c1556f 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -40,17 +40,13 @@ enum {
 struct spu_context_ops;
 struct spu_gang;
 
-/*
- * This is the state for spu utilization reporting to userspace.
- * Because this state is visible to userspace it must never change and needs
- * to be kept strictly separate from any internal state kept by the kernel.
- */
-enum spuctx_execution_state {
-	SPUCTX_UTIL_USER = 0,
-	SPUCTX_UTIL_SYSTEM,
-	SPUCTX_UTIL_IOWAIT,
-	SPUCTX_UTIL_LOADED,
-	SPUCTX_UTIL_MAX
+enum {
+	SPU_SCHED_WAS_ACTIVE,	/* was active upon spu_acquire_saved()  */
+};
+
+/* ctx->sched_flags */
+enum {
+	SPU_SCHED_NOTIFY_ACTIVE,
 };
 
 struct spu_context {
@@ -89,6 +85,8 @@ struct spu_context {
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	struct kref *prof_priv_kref;
+	void ( * prof_priv_release) (struct kref *kref);
 
 	/* owner thread */
 	pid_t tid;
@@ -104,9 +102,9 @@ struct spu_context {
 	/* statistics */
 	struct {
 		/* updates protected by ctx->state_mutex */
-		enum spuctx_execution_state execution_state;
-		unsigned long tstamp;		/* time of last ctx switch */
-		unsigned long times[SPUCTX_UTIL_MAX];
+		enum spu_utilization_state util_state;
+		unsigned long long tstamp;	/* time of last state switch */
+		unsigned long long times[SPU_UTIL_MAX];
 		unsigned long long vol_ctx_switch;
 		unsigned long long invol_ctx_switch;
 		unsigned long long min_flt;
@@ -118,6 +116,10 @@ struct spu_context {
 		unsigned long long class2_intr_base; /* # at last ctx switch */
 		unsigned long long libassist;
 	} stats;
+
+	struct list_head aff_list;
+	int aff_head;
+	int aff_offset;
 };
 
 struct spu_gang {
@@ -125,8 +127,19 @@ struct spu_gang {
 	struct mutex mutex;
 	struct kref kref;
 	int contexts;
+
+	struct spu_context *aff_ref_ctx;
+	struct list_head aff_list_head;
+	struct mutex aff_mutex;
+	int aff_flags;
+	struct spu *aff_ref_spu;
+	atomic_t aff_sched_count;
 };
 
+/* Flag bits for spu_gang aff_flags */
+#define AFF_OFFSETS_SET		1
+#define AFF_MERGED		2
+
 struct mfc_dma_command {
 	int32_t pad;	/* reserved */
 	uint32_t lsa;	/* local storage address */
@@ -190,10 +203,9 @@ extern struct tree_descr spufs_dir_contents[];
 extern struct tree_descr spufs_dir_nosched_contents[];
 
 /* system call implementation */
-long spufs_run_spu(struct file *file,
-		   struct spu_context *ctx, u32 *npc, u32 *status);
-long spufs_create(struct nameidata *nd,
-			 unsigned int flags, mode_t mode);
+long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status);
+long spufs_create(struct nameidata *nd, unsigned int flags,
+			mode_t mode, struct file *filp);
 extern const struct file_operations spufs_context_fops;
 
 /* gang management */
@@ -206,6 +218,9 @@ void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx);
 /* fault handling */
 int spufs_handle_class1(struct spu_context *ctx);
 
+/* affinity */
+struct spu *affinity_check(struct spu_context *ctx);
+
 /* context management */
 extern atomic_t nr_spu_contexts;
 static inline void spu_acquire(struct spu_context *ctx)
@@ -227,15 +242,17 @@ void spu_unmap_mappings(struct spu_context *ctx);
 void spu_forget(struct spu_context *ctx);
 int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags);
 void spu_acquire_saved(struct spu_context *ctx);
+void spu_release_saved(struct spu_context *ctx);
 
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
+void spu_switch_notify(struct spu *spu, struct spu_context *ctx);
 void spu_set_timeslice(struct spu_context *ctx);
 void spu_update_sched_info(struct spu_context *ctx);
 void __spu_update_sched_info(struct spu_context *ctx);
 int __init spu_sched_init(void);
-void __exit spu_sched_exit(void);
+void spu_sched_exit(void);
 
 extern char *isolated_loader;
 
@@ -293,30 +310,34 @@ extern int spufs_coredump_num_notes;
  * line.
  */
 static inline void spuctx_switch_state(struct spu_context *ctx,
-		enum spuctx_execution_state new_state)
+		enum spu_utilization_state new_state)
 {
-	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
-
-	if (ctx->stats.execution_state != new_state) {
-		unsigned long curtime = jiffies;
-
-		ctx->stats.times[ctx->stats.execution_state] +=
-				 curtime - ctx->stats.tstamp;
-		ctx->stats.tstamp = curtime;
-		ctx->stats.execution_state = new_state;
-	}
-}
+	unsigned long long curtime;
+	signed long long delta;
+	struct timespec ts;
+	struct spu *spu;
+	enum spu_utilization_state old_state;
 
-static inline void spu_switch_state(struct spu *spu,
-		enum spuctx_execution_state new_state)
-{
-	if (spu->stats.utilization_state != new_state) {
-		unsigned long curtime = jiffies;
+	ktime_get_ts(&ts);
+	curtime = timespec_to_ns(&ts);
+	delta = curtime - ctx->stats.tstamp;
 
-		spu->stats.times[spu->stats.utilization_state] +=
-				 curtime - spu->stats.tstamp;
+	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
+	WARN_ON(delta < 0);
+
+	spu = ctx->spu;
+	old_state = ctx->stats.util_state;
+	ctx->stats.util_state = new_state;
+	ctx->stats.tstamp = curtime;
+
+	/*
+	 * Update the physical SPU utilization statistics.
+	 */
+	if (spu) {
+		ctx->stats.times[old_state] += delta;
+		spu->stats.times[old_state] += delta;
+		spu->stats.util_state = new_state;
 		spu->stats.tstamp = curtime;
-		spu->stats.utilization_state = new_state;
 	}
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 9c506ba08cdc..27ffdae98e5a 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -180,7 +180,7 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
 	case MFC_CNTL_SUSPEND_COMPLETE:
 		if (csa) {
 			csa->priv2.mfc_control_RW =
-				in_be64(&priv2->mfc_control_RW) |
+				MFC_CNTL_SUSPEND_MASK |
 				MFC_CNTL_SUSPEND_DMA_QUEUE;
 		}
 		break;
@@ -190,9 +190,7 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
 				  MFC_CNTL_SUSPEND_DMA_STATUS_MASK) ==
 				 MFC_CNTL_SUSPEND_COMPLETE);
 		if (csa) {
-			csa->priv2.mfc_control_RW =
-				in_be64(&priv2->mfc_control_RW) &
-				~MFC_CNTL_SUSPEND_DMA_QUEUE;
+			csa->priv2.mfc_control_RW = 0;
 		}
 		break;
 	}
@@ -251,16 +249,8 @@ static inline void save_mfc_decr(struct spu_state *csa, struct spu *spu)
 	 *     Read MFC_CNTL[Ds].  Update saved copy of
 	 *     CSA.MFC_CNTL[Ds].
 	 */
-	if (in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING) {
-		csa->priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
-		csa->suspend_time = get_cycles();
-		out_be64(&priv2->spu_chnlcntptr_RW, 7ULL);
-		eieio();
-		csa->spu_chnldata_RW[7] = in_be64(&priv2->spu_chnldata_RW);
-		eieio();
-	} else {
-		csa->priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
-	}
+	csa->priv2.mfc_control_RW |=
+		in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING;
 }
 
 static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu)
@@ -271,7 +261,8 @@ static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu)
 	 *     Write MFC_CNTL[Dh] set to a '1' to halt
 	 *     the decrementer.
 	 */
-	out_be64(&priv2->mfc_control_RW, MFC_CNTL_DECREMENTER_HALTED);
+	out_be64(&priv2->mfc_control_RW,
+		 MFC_CNTL_DECREMENTER_HALTED | MFC_CNTL_SUSPEND_MASK);
 	eieio();
 }
 
@@ -615,7 +606,7 @@ static inline void save_ppuint_mb(struct spu_state *csa, struct spu *spu)
 static inline void save_ch_part1(struct spu_state *csa, struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
-	u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+	u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
 	int i;
 
 	/* Save, Step 42:
@@ -626,7 +617,7 @@ static inline void save_ch_part1(struct spu_state *csa, struct spu *spu)
 	csa->spu_chnldata_RW[1] = in_be64(&priv2->spu_chnldata_RW);
 
 	/* Save the following CH: [0,3,4,24,25,27] */
-	for (i = 0; i < 7; i++) {
+	for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
 		idx = ch_indices[i];
 		out_be64(&priv2->spu_chnlcntptr_RW, idx);
 		eieio();
@@ -983,13 +974,13 @@ static inline void terminate_spu_app(struct spu_state *csa, struct spu *spu)
 	 */
 }
 
-static inline void suspend_mfc(struct spu_state *csa, struct spu *spu)
+static inline void suspend_mfc_and_halt_decr(struct spu_state *csa,
+		struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
 
 	/* Restore, Step 7:
-	 * Restore, Step 47.
-	 *     Write MFC_Cntl[Dh,Sc]='1','1' to suspend
+	 *     Write MFC_Cntl[Dh,Sc,Sm]='1','1','0' to suspend
 	 *     the queue and halt the decrementer.
 	 */
 	out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE |
@@ -1090,7 +1081,7 @@ static inline void clear_spu_status(struct spu_state *csa, struct spu *spu)
 static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
-	u64 ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+	u64 ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
 	u64 idx;
 	int i;
 
@@ -1102,7 +1093,7 @@ static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu)
 	out_be64(&priv2->spu_chnldata_RW, 0UL);
 
 	/* Reset the following CH: [0,3,4,24,25,27] */
-	for (i = 0; i < 7; i++) {
+	for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
 		idx = ch_indices[i];
 		out_be64(&priv2->spu_chnlcntptr_RW, idx);
 		eieio();
@@ -1289,7 +1280,15 @@ static inline void setup_decr(struct spu_state *csa, struct spu *spu)
 		cycles_t resume_time = get_cycles();
 		cycles_t delta_time = resume_time - csa->suspend_time;
 
+		csa->lscsa->decr_status.slot[0] = SPU_DECR_STATUS_RUNNING;
+		if (csa->lscsa->decr.slot[0] < delta_time) {
+			csa->lscsa->decr_status.slot[0] |=
+				 SPU_DECR_STATUS_WRAPPED;
+		}
+
 		csa->lscsa->decr.slot[0] -= delta_time;
+	} else {
+		csa->lscsa->decr_status.slot[0] = 0;
 	}
 }
 
@@ -1398,6 +1397,18 @@ static inline void restore_ls_16kb(struct spu_state *csa, struct spu *spu)
 	send_mfc_dma(spu, addr, ls_offset, size, tag, rclass, cmd);
 }
 
+static inline void suspend_mfc(struct spu_state *csa, struct spu *spu)
+{
+	struct spu_priv2 __iomem *priv2 = spu->priv2;
+
+	/* Restore, Step 47.
+	 *     Write MFC_Cntl[Sc,Sm]='1','0' to suspend
+	 *     the queue.
+	 */
+	out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE);
+	eieio();
+}
+
 static inline void clear_interrupts(struct spu_state *csa, struct spu *spu)
 {
 	/* Restore, Step 49:
@@ -1548,10 +1559,10 @@ static inline void restore_decr_wrapped(struct spu_state *csa, struct spu *spu)
 	 *     "wrapped" flag is set, OR in a '1' to
 	 *     CSA.SPU_Event_Status[Tm].
 	 */
-	if (csa->lscsa->decr_status.slot[0] == 1) {
+	if (csa->lscsa->decr_status.slot[0] & SPU_DECR_STATUS_WRAPPED) {
 		csa->spu_chnldata_RW[0] |= 0x20;
 	}
-	if ((csa->lscsa->decr_status.slot[0] == 1) &&
+	if ((csa->lscsa->decr_status.slot[0] & SPU_DECR_STATUS_WRAPPED) &&
 	    (csa->spu_chnlcnt_RW[0] == 0 &&
 	     ((csa->spu_chnldata_RW[2] & 0x20) == 0x0) &&
 	     ((csa->spu_chnldata_RW[0] & 0x20) != 0x1))) {
@@ -1562,18 +1573,13 @@ static inline void restore_decr_wrapped(struct spu_state *csa, struct spu *spu)
 static inline void restore_ch_part1(struct spu_state *csa, struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
-	u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+	u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
 	int i;
 
 	/* Restore, Step 59:
+	 *	Restore the following CH: [0,3,4,24,25,27]
 	 */
-
-	/* Restore CH 1 without count */
-	out_be64(&priv2->spu_chnlcntptr_RW, 1);
-	out_be64(&priv2->spu_chnldata_RW, csa->spu_chnldata_RW[1]);
-
-	/* Restore the following CH: [0,3,4,24,25,27] */
-	for (i = 0; i < 7; i++) {
+	for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
 		idx = ch_indices[i];
 		out_be64(&priv2->spu_chnlcntptr_RW, idx);
 		eieio();
@@ -1932,7 +1938,7 @@ static void harvest(struct spu_state *prev, struct spu *spu)
 	set_switch_pending(prev, spu);	        /* Step 5.  */
 	stop_spu_isolate(spu);			/* NEW.     */
 	remove_other_spu_access(prev, spu);	/* Step 6.  */
-	suspend_mfc(prev, spu);	                /* Step 7.  */
+	suspend_mfc_and_halt_decr(prev, spu);	/* Step 7.  */
 	wait_suspend_mfc_complete(prev, spu);	/* Step 8.  */
 	if (!suspend_spe(prev, spu))	        /* Step 9.  */
 		clear_spu_status(prev, spu);	/* Step 10. */
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 8e37bdf4dfda..43f0fb88abbc 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -47,7 +47,7 @@ static long do_spu_run(struct file *filp,
 		goto out;
 
 	i = SPUFS_I(filp->f_path.dentry->d_inode);
-	ret = spufs_run_spu(filp, i->i_ctx, &npc, &status);
+	ret = spufs_run_spu(i->i_ctx, &npc, &status);
 
 	if (put_user(npc, unpc))
 		ret = -EFAULT;
@@ -76,8 +76,8 @@ asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
 }
 #endif
 
-asmlinkage long sys_spu_create(const char __user *pathname,
-					unsigned int flags, mode_t mode)
+asmlinkage long do_spu_create(const char __user *pathname, unsigned int flags,
+				mode_t mode, struct file *neighbor)
 {
 	char *tmp;
 	int ret;
@@ -90,7 +90,7 @@ asmlinkage long sys_spu_create(const char __user *pathname,
 		ret = path_lookup(tmp, LOOKUP_PARENT|
 				LOOKUP_OPEN|LOOKUP_CREATE, &nd);
 		if (!ret) {
-			ret = spufs_create(&nd, flags, mode);
+			ret = spufs_create(&nd, flags, mode, neighbor);
 			path_release(&nd);
 		}
 		putname(tmp);
@@ -99,8 +99,32 @@ asmlinkage long sys_spu_create(const char __user *pathname,
 	return ret;
 }
 
+#ifndef MODULE
+/* Resolve neighbor_fd only when SPU_CREATE_AFFINITY_SPU is set; else NULL. */
+asmlinkage long sys_spu_create(const char __user *pathname, unsigned int flags,
+				mode_t mode, int neighbor_fd)
+{
+	int fput_needed;
+	struct file *neighbor;
+	long ret;
+
+	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		ret = -EBADF;
+		neighbor = fget_light(neighbor_fd, &fput_needed);
+		if (neighbor) {
+			ret = do_spu_create(pathname, flags, mode, neighbor);
+			fput_light(neighbor, fput_needed);
+		}
+	} else {
+		ret = do_spu_create(pathname, flags, mode, NULL);
+	}
+
+	return ret;
+}
+#endif
+
 struct spufs_calls spufs_calls = {
-	.create_thread = sys_spu_create,
+	.create_thread = do_spu_create,
 	.spu_run = do_spu_run,
 	.owner = THIS_MODULE,
 };
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index f65078c3d3b3..484eb4e0e9db 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_QUICC_ENGINE)	+= qe_lib/
 mv64x60-$(CONFIG_PCI)		+= mv64x60_pci.o
 obj-$(CONFIG_MV64X60)		+= $(mv64x60-y) mv64x60_pic.o mv64x60_dev.o
 obj-$(CONFIG_RTC_DRV_CMOS)	+= rtc_cmos_setup.o
+obj-$(CONFIG_AXON_RAM)		+= axonram.o
 
 # contains only the suspend handler for time
 ifeq ($(CONFIG_RTC_CLASS),)
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
new file mode 100644
index 000000000000..2326d5dc5752
--- /dev/null
+++ b/arch/powerpc/sysdev/axonram.c
@@ -0,0 +1,381 @@
+/*
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2006
+ *
+ * Author: Maxim Shchetynin <maxim@de.ibm.com>
+ *
+ * Axon DDR2 device driver.
+ * It registers one block device per Axon's DDR2 memory bank found on a system.
+ * Block devices are called axonram?, their major and minor numbers are
+ * available in /proc/devices, /proc/partitions or in /sys/block/axonram?/dev.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/irqreturn.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <asm/of_device.h>
+#include <asm/of_platform.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+
+#define AXON_RAM_MODULE_NAME		"axonram"	/* driver name */
+#define AXON_RAM_DEVICE_NAME		"axonram"	/* disk name prefix */
+#define AXON_RAM_MINORS_PER_DISK	16
+#define AXON_RAM_BLOCK_SHIFT		PAGE_SHIFT
+#define AXON_RAM_BLOCK_SIZE		(1 << AXON_RAM_BLOCK_SHIFT)
+#define AXON_RAM_SECTOR_SHIFT		9		/* 512-byte sectors */
+#define AXON_RAM_SECTOR_SIZE		(1 << AXON_RAM_SECTOR_SHIFT)
+#define AXON_RAM_IRQ_FLAGS		(IRQF_SHARED | IRQF_TRIGGER_RISING)
+
+struct axon_ram_bank {
+	struct of_device	*device;	/* OF device of this bank */
+	struct gendisk		*disk;		/* disk registered for it */
+	unsigned int		irq_correctable;	/* node interrupt 0 */
+	unsigned int		irq_uncorrectable;	/* node interrupt 1 */
+	unsigned long		ph_addr;	/* physical start address */
+	unsigned long		io_addr;	/* ioremap of ph_addr */
+	unsigned long		size;		/* length in bytes */
+	unsigned long		ecc_counter;	/* correctable errors seen */
+};
+
+/* sysfs "ecc" show method: reports the bank's count of correctable errors. */
+static ssize_t
+axon_ram_sysfs_ecc(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct of_device *device = to_of_device(dev);
+	struct axon_ram_bank *bank = device->dev.platform_data;
+
+	BUG_ON(!bank);
+	return sprintf(buf, "%lu\n", bank->ecc_counter); /* counter is unsigned */
+}
+
+static DEVICE_ATTR(ecc, S_IRUGO, axon_ram_sysfs_ecc, NULL);
+
+/**
+ * axon_ram_irq_handler - interrupt handler for Axon RAM ECC
+ * @irq: interrupt ID, compared against the bank's two ECC interrupts
+ * @dev: pointer to of_device whose platform_data is the bank
+ */
+static irqreturn_t
+axon_ram_irq_handler(int irq, void *dev)
+{
+	struct of_device *ofdev = dev;
+	struct axon_ram_bank *bank = ofdev->dev.platform_data;
+
+	BUG_ON(!bank);
+
+	if (irq == bank->irq_correctable) {
+		dev_err(&ofdev->dev, "Correctable memory error occured\n");
+		bank->ecc_counter++;
+		return IRQ_HANDLED;
+	}
+	if (irq != bank->irq_uncorrectable)
+		return IRQ_NONE;	/* not one of this bank's ECC lines */
+	dev_err(&ofdev->dev, "Uncorrectable memory error occured\n");
+	panic("Critical ECC error on %s", ofdev->node->full_name);
+	return IRQ_NONE;
+}
+
+/**
+ * axon_ram_make_request - make_request() method for block device
+ * @queue, @bio: see blk_queue_make_request()
+ */
+static int
+axon_ram_make_request(struct request_queue *queue, struct bio *bio)
+{
+	struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data;
+	unsigned long phys_mem, phys_end;
+	void *user_mem;
+	struct bio_vec *vec;
+	unsigned int transfered;
+	unsigned short idx;
+
+	phys_mem = bank->io_addr + (bio->bi_sector << AXON_RAM_SECTOR_SHIFT);
+	phys_end = bank->io_addr + bank->size;
+	transfered = 0;
+	bio_for_each_segment(vec, bio, idx) {
+		if (unlikely(phys_mem + vec->bv_len > phys_end)) {
+			/* Fails the whole bio: it must not be ended again. */
+			bio_io_error(bio, bio->bi_size);
+			return 0;
+		}
+
+		user_mem = page_address(vec->bv_page) + vec->bv_offset;
+		if (bio_data_dir(bio) == READ)
+			memcpy(user_mem, (void *) phys_mem, vec->bv_len);
+		else
+			memcpy((void *) phys_mem, user_mem, vec->bv_len);
+
+		phys_mem += vec->bv_len;
+		transfered += vec->bv_len;
+	}
+	bio_endio(bio, transfered, 0);
+
+	/* Always 0: the bio is finished here; non-zero would resubmit it. */
+	return 0;
+}
+
+/**
+ * axon_ram_direct_access - direct_access() method for block device
+ * @device, @sector, @data: see block_device_operations method
+ */
+static int
+axon_ram_direct_access(struct block_device *device, sector_t sector,
+		       unsigned long *data)
+{
+	struct axon_ram_bank *bank = device->bd_disk->private_data;
+	loff_t pos = sector << AXON_RAM_SECTOR_SHIFT;
+
+	if (pos < bank->size) {
+		*data = bank->ph_addr + pos;
+		return 0;
+	}
+
+	/* Requested sector starts at or beyond the end of the bank. */
+	dev_err(&bank->device->dev, "Access outside of address space\n");
+
+	return -ERANGE;
+}
+
+static struct block_device_operations axon_ram_devops = {
+	.owner		= THIS_MODULE,
+	.direct_access	= axon_ram_direct_access	/* only method set here */
+};
+
+/**
+ * axon_ram_probe - probe() method for platform driver
+ * @device, @device_id: see of_platform_driver method
+ * Returns 0, or a negative errno after releasing whatever was set up so far.
+ */
+static int
+axon_ram_probe(struct of_device *device, const struct of_device_id *device_id)
+{
+	static int axon_ram_bank_id = -1;
+	struct axon_ram_bank *bank;
+	struct resource resource;
+	int rc = 0;
+
+	axon_ram_bank_id++;
+	dev_info(&device->dev, "Found memory controller on %s\n",
+			device->node->full_name);
+
+	bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL);
+	if (bank == NULL) {
+		dev_err(&device->dev, "Out of memory\n");
+		return -ENOMEM;		/* nothing to undo yet */
+	}
+
+	device->dev.platform_data = bank;
+	bank->device = device;
+
+	if (of_address_to_resource(device->node, 0, &resource) != 0) {
+		dev_err(&device->dev, "Cannot access device tree\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->size = resource.end - resource.start + 1;
+	if (bank->size == 0) {
+		dev_err(&device->dev, "No DDR2 memory found for %s%d\n",
+				AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
+		rc = -ENODEV;
+		goto failed;
+	}
+
+	dev_info(&device->dev, "Register DDR2 memory device %s%d with %luMB\n",
+			AXON_RAM_DEVICE_NAME, axon_ram_bank_id, bank->size >> 20);
+
+	bank->ph_addr = resource.start;
+	bank->io_addr = (unsigned long) ioremap_flags(
+			bank->ph_addr, bank->size, _PAGE_NO_CACHE);
+	if (bank->io_addr == 0) {
+		dev_err(&device->dev, "ioremap() failed\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->disk = alloc_disk(AXON_RAM_MINORS_PER_DISK);
+	if (bank->disk == NULL) {
+		dev_err(&device->dev, "Cannot register disk\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->disk->first_minor = 0;
+	bank->disk->fops = &axon_ram_devops;
+	bank->disk->private_data = bank;
+	bank->disk->driverfs_dev = &device->dev;
+
+	sprintf(bank->disk->disk_name, "%s%d",
+			AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
+	bank->disk->major = register_blkdev(0, bank->disk->disk_name);
+	if (bank->disk->major < 0) {
+		dev_err(&device->dev, "Cannot register block device\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->disk->queue = blk_alloc_queue(GFP_KERNEL);
+	if (bank->disk->queue == NULL) {
+		dev_err(&device->dev, "Cannot register disk queue\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
+	blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
+	blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
+	add_disk(bank->disk);
+
+	bank->irq_correctable = irq_of_parse_and_map(device->node, 0);
+	bank->irq_uncorrectable = irq_of_parse_and_map(device->node, 1);
+	if ((bank->irq_correctable <= 0) || (bank->irq_uncorrectable <= 0)) {
+		dev_err(&device->dev, "Cannot access ECC interrupt ID\n");
+		/* Nothing requested yet: keep "failed" away from free_irq(). */
+		bank->irq_correctable = bank->irq_uncorrectable = 0;
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	rc = request_irq(bank->irq_correctable, axon_ram_irq_handler,
+			AXON_RAM_IRQ_FLAGS, bank->disk->disk_name, device);
+	if (rc != 0) {
+		dev_err(&device->dev, "Cannot register ECC interrupt handler\n");
+		bank->irq_correctable = bank->irq_uncorrectable = 0;
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	rc = request_irq(bank->irq_uncorrectable, axon_ram_irq_handler,
+			AXON_RAM_IRQ_FLAGS, bank->disk->disk_name, device);
+	if (rc != 0) {
+		dev_err(&device->dev, "Cannot register ECC interrupt handler\n");
+		bank->irq_uncorrectable = 0;
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	rc = device_create_file(&device->dev, &dev_attr_ecc);
+	if (rc != 0) {
+		dev_err(&device->dev, "Cannot create sysfs file\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	if (bank->irq_uncorrectable > 0)
+		free_irq(bank->irq_uncorrectable, device);
+	if (bank->irq_correctable > 0)
+		free_irq(bank->irq_correctable, device);
+	if (bank->disk != NULL) {
+		/* Only a disk that add_disk() brought up may be deleted. */
+		if (bank->disk->flags & GENHD_FL_UP)
+			del_gendisk(bank->disk);
+		if (bank->disk->queue != NULL)
+			blk_cleanup_queue(bank->disk->queue);
+		if (bank->disk->major > 0)
+			unregister_blkdev(bank->disk->major,
+					bank->disk->disk_name);
+		put_disk(bank->disk);	/* drop the alloc_disk() reference */
+	}
+	device->dev.platform_data = NULL;
+	if (bank->io_addr != 0)
+		iounmap((void __iomem *) bank->io_addr);
+	kfree(bank);
+
+	return rc;
+}
+
+/**
+ * axon_ram_remove - remove() method for platform driver
+ * @device: see of_platform_driver method
+ */
+static int
+axon_ram_remove(struct of_device *device)
+{
+	struct axon_ram_bank *bank = device->dev.platform_data;
+
+	BUG_ON(!bank || !bank->disk);
+	device_remove_file(&device->dev, &dev_attr_ecc);
+	free_irq(bank->irq_uncorrectable, device);
+	free_irq(bank->irq_correctable, device);
+	del_gendisk(bank->disk);	/* unhook the disk before its queue goes */
+	blk_cleanup_queue(bank->disk->queue);
+	unregister_blkdev(bank->disk->major, bank->disk->disk_name);
+	put_disk(bank->disk);		/* drop the alloc_disk() reference */
+	iounmap((void __iomem *) bank->io_addr);
+	kfree(bank);
+
+	return 0;
+}
+
+static struct of_device_id axon_ram_device_id[] = {
+	{
+		.type	= "dma-memory"	/* the only device type listed */
+	},
+	{}	/* empty last entry */
+};
+
+static struct of_platform_driver axon_ram_driver = {
+	.owner		= THIS_MODULE,
+	.name		= AXON_RAM_MODULE_NAME,
+	.match_table	= axon_ram_device_id,	/* type "dma-memory" */
+	.probe		= axon_ram_probe,	/* sets up one bank */
+	.remove		= axon_ram_remove	/* tears that bank down */
+};
+
+/**
+ * axon_ram_init - module init: registers axon_ram_driver, returns its result
+ */
+static int __init
+axon_ram_init(void)
+{
+	return of_register_platform_driver(&axon_ram_driver);
+}
+
+/**
+ * axon_ram_exit - module exit: unregisters axon_ram_driver
+ */
+static void __exit
+axon_ram_exit(void)
+{
+	of_unregister_platform_driver(&axon_ram_driver);
+}
+
+module_init(axon_ram_init);
+module_exit(axon_ram_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Maxim Shchetynin <maxim@de.ibm.com>");
+MODULE_DESCRIPTION("Axon DDR2 RAM device driver for IBM Cell BE");
diff --git a/arch/powerpc/sysdev/pmi.c b/arch/powerpc/sysdev/pmi.c
index 85a7c99c1003..2f91b55b7754 100644
--- a/arch/powerpc/sysdev/pmi.c
+++ b/arch/powerpc/sysdev/pmi.c
@@ -48,15 +48,13 @@ struct pmi_data {
 	struct work_struct	work;
 };
 
+static struct pmi_data *data;
 
 static int pmi_irq_handler(int irq, void *dev_id)
 {
-	struct pmi_data *data;
 	u8 type;
 	int rc;
 
-	data = dev_id;
-
 	spin_lock(&data->pmi_spinlock);
 
 	type = ioread8(data->pmi_reg + PMI_READ_TYPE);
@@ -111,16 +109,13 @@ MODULE_DEVICE_TABLE(of, pmi_match);
 
 static void pmi_notify_handlers(struct work_struct *work)
 {
-	struct pmi_data *data;
 	struct pmi_handler *handler;
 
-	data = container_of(work, struct pmi_data, work);
-
 	spin_lock(&data->handler_spinlock);
 	list_for_each_entry(handler, &data->handler, node) {
 		pr_debug(KERN_INFO "pmi: notifying handler %p\n", handler);
 		if (handler->type == data->msg.type)
-			handler->handle_pmi_message(data->dev, data->msg);
+			handler->handle_pmi_message(data->msg);
 	}
 	spin_unlock(&data->handler_spinlock);
 }
@@ -129,9 +124,14 @@ static int pmi_of_probe(struct of_device *dev,
 			const struct of_device_id *match)
 {
 	struct device_node *np = dev->node;
-	struct pmi_data *data;
 	int rc;
 
+	if (data) {
+		printk(KERN_ERR "pmi: driver has already been initialized.\n");
+		rc = -EBUSY;
+		goto out;
+	}
+
 	data = kzalloc(sizeof(struct pmi_data), GFP_KERNEL);
 	if (!data) {
 		printk(KERN_ERR "pmi: could not allocate memory.\n");
@@ -154,7 +154,6 @@ static int pmi_of_probe(struct of_device *dev,
 
 	INIT_WORK(&data->work, pmi_notify_handlers);
 
-	dev->dev.driver_data = data;
 	data->dev = dev;
 
 	data->irq = irq_of_parse_and_map(np, 0);
@@ -164,7 +163,7 @@ static int pmi_of_probe(struct of_device *dev,
 		goto error_cleanup_iomap;
 	}
 
-	rc = request_irq(data->irq, pmi_irq_handler, 0, "pmi", data);
+	rc = request_irq(data->irq, pmi_irq_handler, 0, "pmi", NULL);
 	if (rc) {
 		printk(KERN_ERR "pmi: can't request IRQ %d: returned %d\n",
 				data->irq, rc);
@@ -187,12 +186,9 @@ out:
 
 static int pmi_of_remove(struct of_device *dev)
 {
-	struct pmi_data *data;
 	struct pmi_handler *handler, *tmp;
 
-	data = dev->dev.driver_data;
-
-	free_irq(data->irq, data);
+	free_irq(data->irq, NULL);
 	iounmap(data->pmi_reg);
 
 	spin_lock(&data->handler_spinlock);
@@ -202,7 +198,8 @@ static int pmi_of_remove(struct of_device *dev)
 
 	spin_unlock(&data->handler_spinlock);
 
-	kfree(dev->dev.driver_data);
+	kfree(data);
+	data = NULL;
 
 	return 0;
 }
@@ -226,13 +223,13 @@ static void __exit pmi_module_exit(void)
 }
 module_exit(pmi_module_exit);
 
-void pmi_send_message(struct of_device *device, pmi_message_t msg)
+int pmi_send_message(pmi_message_t msg)
 {
-	struct pmi_data *data;
 	unsigned long flags;
 	DECLARE_COMPLETION_ONSTACK(completion);
 
-	data = device->dev.driver_data;
+	if (!data)
+		return -ENODEV;
 
 	mutex_lock(&data->msg_mutex);
 
@@ -256,30 +253,26 @@ void pmi_send_message(struct of_device *device, pmi_message_t msg)
 	data->completion = NULL;
 
 	mutex_unlock(&data->msg_mutex);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pmi_send_message);
 
-void pmi_register_handler(struct of_device *device,
-			  struct pmi_handler *handler)
+int pmi_register_handler(struct pmi_handler *handler)
 {
-	struct pmi_data *data;
-	data = device->dev.driver_data;
-
 	if (!data)
-		return;
+		return -ENODEV;
 
 	spin_lock(&data->handler_spinlock);
 	list_add_tail(&handler->node, &data->handler);
 	spin_unlock(&data->handler_spinlock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pmi_register_handler);
 
-void pmi_unregister_handler(struct of_device *device,
-			    struct pmi_handler *handler)
+void pmi_unregister_handler(struct pmi_handler *handler)
 {
-	struct pmi_data *data;
-	data = device->dev.driver_data;
-
 	if (!data)
 		return;