From 1d880992fd8c8457a2d990ac6622cfd58fb1b261 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 2 Jul 2014 15:36:20 +1000 Subject: Initial commit of Open Source release Signed-off-by: Benjamin Herrenschmidt --- hw/Makefile.inc | 15 + hw/ast-bmc/Makefile.inc | 5 + hw/ast-bmc/ast-io.c | 324 ++++ hw/ast-bmc/ast-sf-ctrl.c | 412 +++++ hw/cec.c | 84 + hw/centaur.c | 326 ++++ hw/chiptod.c | 685 ++++++++ hw/ec/Makefile.inc | 8 + hw/ec/gpio.c | 87 + hw/ec/makefile | 8 + hw/fsi-master.c | 297 ++++ hw/fsp/Makefile.inc | 9 + hw/fsp/fsp-codeupdate.c | 1197 ++++++++++++++ hw/fsp/fsp-console.c | 922 +++++++++++ hw/fsp/fsp-diag.c | 58 + hw/fsp/fsp-dump.c | 917 +++++++++++ hw/fsp/fsp-elog-read.c | 520 ++++++ hw/fsp/fsp-elog-write.c | 643 ++++++++ hw/fsp/fsp-leds.c | 1080 +++++++++++++ hw/fsp/fsp-mdst-table.c | 252 +++ hw/fsp/fsp-mem-err.c | 415 +++++ hw/fsp/fsp-nvram.c | 414 +++++ hw/fsp/fsp-op-panel.c | 249 +++ hw/fsp/fsp-rtc.c | 572 +++++++ hw/fsp/fsp-sensor.c | 788 +++++++++ hw/fsp/fsp-surveillance.c | 209 +++ hw/fsp/fsp-sysparam.c | 454 ++++++ hw/fsp/fsp.c | 2147 +++++++++++++++++++++++++ hw/gx.c | 158 ++ hw/homer.c | 143 ++ hw/lpc-uart.c | 343 ++++ hw/lpc.c | 500 ++++++ hw/nx.c | 127 ++ hw/occ.c | 477 ++++++ hw/p5ioc2-phb.c | 1233 ++++++++++++++ hw/p5ioc2.c | 297 ++++ hw/p7ioc-inits.c | 1096 +++++++++++++ hw/p7ioc-phb.c | 3206 +++++++++++++++++++++++++++++++++++++ hw/p7ioc.c | 677 ++++++++ hw/phb3.c | 3880 +++++++++++++++++++++++++++++++++++++++++++++ hw/psi.c | 873 ++++++++++ hw/sfc-ctrl.c | 523 ++++++ hw/slw.c | 875 ++++++++++ hw/xscom.c | 518 ++++++ 44 files changed, 28023 insertions(+) create mode 100644 hw/Makefile.inc create mode 100644 hw/ast-bmc/Makefile.inc create mode 100644 hw/ast-bmc/ast-io.c create mode 100644 hw/ast-bmc/ast-sf-ctrl.c create mode 100644 hw/cec.c create mode 100644 hw/centaur.c create mode 100644 hw/chiptod.c create mode 100644 hw/ec/Makefile.inc create mode 100644 hw/ec/gpio.c create mode 100644 hw/ec/makefile create mode 100644 hw/fsi-master.c create mode 100644 hw/fsp/Makefile.inc create mode 100644 hw/fsp/fsp-codeupdate.c create mode 100644 hw/fsp/fsp-console.c create mode 100644 hw/fsp/fsp-diag.c create mode 100644 hw/fsp/fsp-dump.c create mode 100644 hw/fsp/fsp-elog-read.c create mode 100644 hw/fsp/fsp-elog-write.c create mode 100644 hw/fsp/fsp-leds.c create mode 100644 hw/fsp/fsp-mdst-table.c create mode 100644 hw/fsp/fsp-mem-err.c create mode 100644 hw/fsp/fsp-nvram.c create mode 100644 hw/fsp/fsp-op-panel.c create mode 100644 hw/fsp/fsp-rtc.c create mode 100644 hw/fsp/fsp-sensor.c create mode 100644 hw/fsp/fsp-surveillance.c create mode 100644 hw/fsp/fsp-sysparam.c create mode 100644 hw/fsp/fsp.c create mode 100644 hw/gx.c create mode 100644 hw/homer.c create mode 100644 hw/lpc-uart.c create mode 100644 hw/lpc.c create mode 100644 hw/nx.c create mode 100644 hw/occ.c create mode 100644 hw/p5ioc2-phb.c create mode 100644 hw/p5ioc2.c create mode 100644 hw/p7ioc-inits.c create mode 100644 hw/p7ioc-phb.c create mode 100644 hw/p7ioc.c create mode 100644 hw/phb3.c create mode 100644 hw/psi.c create mode 100644 hw/sfc-ctrl.c create mode 100644 hw/slw.c create mode 100644 hw/xscom.c (limited to 'hw') diff --git a/hw/Makefile.inc b/hw/Makefile.inc new file mode 100644 index 00000000..14bf8e78 --- /dev/null +++ b/hw/Makefile.inc @@ -0,0 +1,15 @@ +# -*-Makefile-*- + +SUBDIRS += hw +HW_OBJS = xscom.o chiptod.o gx.o cec.o lpc.o lpc-uart.o psi.o +HW_OBJS += homer.o slw.o occ.o nx.o fsi-master.o centaur.o +HW_OBJS += p7ioc.o p7ioc-inits.o p7ioc-phb.o p5ioc2.o 
p5ioc2-phb.o +HW_OBJS += phb3.o sfc-ctrl.o +HW=hw/built-in.o + +include $(SRC)/hw/fsp/Makefile.inc +include $(SRC)/hw/ec/Makefile.inc +include $(SRC)/hw/ast-bmc/Makefile.inc + +$(HW): $(HW_OBJS:%=hw/%) $(FSP) $(EC) $(AST_BMC) + diff --git a/hw/ast-bmc/Makefile.inc b/hw/ast-bmc/Makefile.inc new file mode 100644 index 00000000..a97c0dbe --- /dev/null +++ b/hw/ast-bmc/Makefile.inc @@ -0,0 +1,5 @@ +SUBDIRS += hw/ast-bmc + +AST_BMC_OBJS = ast-io.o ast-sf-ctrl.o +AST_BMC = hw/ast-bmc/built-in.o +$(AST_BMC): $(AST_BMC_OBJS:%=hw/ast-bmc/%) diff --git a/hw/ast-bmc/ast-io.c b/hw/ast-bmc/ast-io.c new file mode 100644 index 00000000..e89bf7f5 --- /dev/null +++ b/hw/ast-bmc/ast-io.c @@ -0,0 +1,324 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Note about accesses to the AST2400 internal memory map: + * + * There are two ways to genrate accesses to the AHB bus of the AST2400 + * from the host. The LPC->AHB bridge and the iLPC->AHB bridge. + * + * LPC->AHB bridge + * --------------- + * + * This bridge directly converts memory or firmware accesses using + * a set of registers for establishing a remapping window. We prefer + * using FW space as normal memory space is limited to byte accesses + * to a fixed 256M window, while FW space allows us to use different + * access sizes and to control the IDSEL bits which essentially enable + * a full 4G addres space. + * + * The way FW accesses map onto AHB is controlled via two registers + * in the BMC's LPC host controller: + * + * HICR7 at 0x1e789088 [31:16] : ADRBASE + * [15:00] : HWMBASE + * + * HICR8 at 0x1e78908c [31:16] : ADRMASK + * [15:00] : HWNCARE + * + * All decoding/remapping happens on the top 16 bits of the LPC address + * named LPC_ADDR as follow: + * + * - For decoding, LPC_ADDR bits are compared with HWMBASE if the + * corresponding bit in HWNCARE is 0. + * + * - For remapping, the AHB address is constructed by taking bits + * from LPC_ADDR if the corresponding bit in ADRMASK is 0 or in + * ADRBASE if the corresponding bit in ADRMASK is 1 + * + * Example of 2MB SPI flash, LPC 0xFCE00000~0xFCFFFFFF onto + * AHB 0x30000000~0x301FFFFF (SPI flash) + * + * ADRBASE=0x3000 HWMBASE=0xFCE0 + * ADRMASK=0xFFE0 HWNCARE=0x001F + * + * This comes pre-configured by the BMC or HostBoot to access the PNOR + * flash from IDSEL 0 as follow: + * + * ADRBASE=0x3000 HWMBASE=0x0e00 + * ADRMASK=0xfe00 HWNCARE=0x01ff + * + * Which means mapping of LPC 0x0e000000..0x0fffffff onto + * AHB 0x30000000..0x31ffffff + * + * iLPC->AHB bridge + * --------------- + * + * This bridge is hosted in the SuperIO part of the BMC and is + * controlled by a series of byte-sized registers accessed indirectly + * via IO ports 0x2e and 0x2f. + * + * Via these, byte by byte, we can construct an AHB address and + * fill a data buffer to trigger a write cycle, or we can do a + * read cycle and read back the data, byte after byte. 
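+ *
+ * (Editor's illustration, not part of the original comment: the HICR7/HICR8
+ * decode and remap described above can be written as, with lpc_top16 being
+ * LPC_ADDR[31:16]:
+ *
+ *     hit       = ((lpc_top16 ^ HWMBASE) & ~HWNCARE) == 0;
+ *     ahb_top16 = (lpc_top16 & ~ADRMASK) | (ADRBASE & ADRMASK);
+ *
+ * With the pre-configured PNOR values above, lpc_top16 = 0x0e12 decodes as
+ * a hit and remaps to ahb_top16 = 0x3012, i.e. LPC 0x0e12xxxx goes to
+ * AHB 0x3012xxxx.)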
+ * + * This is fairly convoluted and slow but works regardless of what + * mapping was established in the LPC->AHB bridge. + * + * For the time being, we use the iLPC->AHB for everything except + * pnor accesses. In the long run, we will reconfigure the LPC->AHB + * to provide more direct access to all of the BMC addres space but + * we'll only do that after the boot script/program on the BMC is + * updated to restore the bridge to a state compatible with the SBE + * expectations on boot. + */ + +#include +#include +#include + +#include "ast.h" + +static struct lock bmc_sio_lock = LOCK_UNLOCKED; + +/* + * SuperIO indirect accesses + */ +static void bmc_sio_outb(uint8_t val, uint8_t reg) +{ + lpc_outb(reg, 0x2e); + lpc_outb(val, 0x2f); +} + +static uint8_t bmc_sio_inb(uint8_t reg) +{ + lpc_outb(reg, 0x2e); + return lpc_inb(0x2f); +} + +/* + * AHB accesses via iLPC->AHB in SuperIO. Works on byteswapped + * values (ie. Little Endian registers) + */ +static void bmc_sio_ahb_prep(uint32_t reg, uint8_t type) +{ + /* Address */ + bmc_sio_outb((reg >> 24) & 0xff, 0xf0); + bmc_sio_outb((reg >> 16) & 0xff, 0xf1); + bmc_sio_outb((reg >> 8) & 0xff, 0xf2); + bmc_sio_outb((reg ) & 0xff, 0xf3); + + /* bytes cycle type */ + bmc_sio_outb(type, 0xf8); +} + +static void bmc_sio_ahb_writel(uint32_t val, uint32_t reg) +{ + lock(&bmc_sio_lock); + + bmc_sio_ahb_prep(reg, 2); + + /* Write data */ + bmc_sio_outb(val >> 24, 0xf4); + bmc_sio_outb(val >> 16, 0xf5); + bmc_sio_outb(val >> 8, 0xf6); + bmc_sio_outb(val , 0xf7); + + /* Trigger */ + bmc_sio_outb(0xcf, 0xfe); + + unlock(&bmc_sio_lock); +} + +static uint32_t bmc_sio_ahb_readl(uint32_t reg) +{ + uint32_t val = 0; + + lock(&bmc_sio_lock); + + bmc_sio_ahb_prep(reg, 2); + + /* Trigger */ + bmc_sio_inb(0xfe); + + /* Read results */ + val = (val << 8) | bmc_sio_inb(0xf4); + val = (val << 8) | bmc_sio_inb(0xf5); + val = (val << 8) | bmc_sio_inb(0xf6); + val = (val << 8) | bmc_sio_inb(0xf7); + + unlock(&bmc_sio_lock); + + return val; +} + +static void bmc_sio_ahb_init(void) +{ + /* Send SuperIO password */ + lpc_outb(0xa5, 0x2e); + lpc_outb(0xa5, 0x2e); + + /* Select logical dev d */ + bmc_sio_outb(0x0d, 0x07); + + /* Enable iLPC->AHB */ + bmc_sio_outb(0x01, 0x30); + + /* We leave the SuperIO enabled and unlocked for + * subsequent accesses. + */ +} + +/* + * External API + * + * We only support 4-byte accesses to all of AHB. We additionally + * support 1-byte accesses to the flash area only. + * + * We could support all access sizes via iLPC but we don't need + * that for now. 
+ */ +#define PNOR_AHB_ADDR 0x30000000 +#define PNOR_LPC_OFFSET 0x0e000000 + +void ast_ahb_writel(uint32_t val, uint32_t reg) +{ + /* For now, always use iLPC->AHB, it will byteswap */ + bmc_sio_ahb_writel(val, reg); +} + +uint32_t ast_ahb_readl(uint32_t reg) +{ + /* For now, always use iLPC->AHB, it will byteswap */ + return bmc_sio_ahb_readl(reg); +} + +int ast_copy_to_ahb(uint32_t reg, const void *src, uint32_t len) +{ + /* Check we don't cross IDSEL segments */ + if ((reg ^ (reg + len - 1)) >> 28) + return -EINVAL; + + /* SPI flash, use LPC->AHB bridge */ + if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) { + uint32_t chunk, off = reg - PNOR_AHB_ADDR + PNOR_LPC_OFFSET; + int64_t rc; + + while(len) { + /* Chose access size */ + if (len > 3 && !(off & 3)) { + rc = lpc_write(OPAL_LPC_FW, off, + *(uint32_t *)src, 4); + chunk = 4; + } else { + rc = lpc_write(OPAL_LPC_FW, off, + *(uint8_t *)src, 1); + chunk = 1; + } + if (rc) { + prerror("AST_IO: lpc_write.sb failure %lld" + " to FW 0x%08x\n", rc, off); + return rc; + } + len -= chunk; + off += chunk; + src += chunk; + } + return 0; + } + + /* Otherwise we don't do byte access (... yet) */ + prerror("AST_IO: Attempted write bytes access to %08x\n", reg); + return -EINVAL; +} + +int ast_copy_from_ahb(void *dst, uint32_t reg, uint32_t len) +{ + /* Check we don't cross IDSEL segments */ + if ((reg ^ (reg + len - 1)) >> 28) + return -EINVAL; + + /* SPI flash, use LPC->AHB bridge */ + if ((reg >> 28) == (PNOR_AHB_ADDR >> 28)) { + uint32_t chunk, off = reg - PNOR_AHB_ADDR + PNOR_LPC_OFFSET; + int64_t rc; + + while(len) { + uint32_t dat; + + /* Chose access size */ + if (len > 3 && !(off & 3)) { + rc = lpc_read(OPAL_LPC_FW, off, &dat, 4); + if (!rc) + *(uint32_t *)dst = dat; + chunk = 4; + } else { + rc = lpc_read(OPAL_LPC_FW, off, &dat, 1); + if (!rc) + *(uint8_t *)dst = dat; + chunk = 1; + } + if (rc) { + prerror("AST_IO: lpc_read.sb failure %lld" + " to FW 0x%08x\n", rc, off); + return rc; + } + len -= chunk; + off += chunk; + dst += chunk; + } + return 0; + } + /* Otherwise we don't do byte access (... yet) */ + prerror("AST_IO: Attempted read bytes access to %08x\n", reg); + return -EINVAL; +} + +void ast_io_init(void) +{ + /* Initialize iLPC->AHB bridge */ + bmc_sio_ahb_init(); + + /* Configure the LPC->AHB bridge for PNOR access (just in case) */ + bmc_sio_ahb_writel(0x30000e00, LPC_HICR7); + bmc_sio_ahb_writel(0xfe0001ff, LPC_HICR8); + bmc_sio_ahb_writel(0x00000500, LPC_HICR6); +} + +/* Setup SuperIO UART 1*/ +void ast_setup_uart1(uint16_t io_base, uint8_t irq) +{ + /* Send SuperIO password */ + lpc_outb(0xa5, 0x2e); + lpc_outb(0xa5, 0x2e); + + /* Select logical dev 2 */ + bmc_sio_outb(0x02, 0x07); + + /* Disable UART1 for configuration */ + bmc_sio_outb(0x01, 0x30); + + /* Configure base and interrupt */ + bmc_sio_outb(io_base >> 8, 0x60); + bmc_sio_outb(io_base & 0xff, 0x61); + bmc_sio_outb(irq, 0x70); + bmc_sio_outb(0x01, 0x71); /* level low */ + + /* Enable UART1 */ + bmc_sio_outb(0x01, 0x30); + + /* Re-lock SuperIO */ + lpc_outb(0xaa, 0x2e); +} diff --git a/hw/ast-bmc/ast-sf-ctrl.c b/hw/ast-bmc/ast-sf-ctrl.c new file mode 100644 index 00000000..e0d5fcca --- /dev/null +++ b/hw/ast-bmc/ast-sf-ctrl.c @@ -0,0 +1,412 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ast.h" + +#ifndef __unused +#define __unused __attribute__((unused)) +#endif + +struct ast_sf_ctrl { + /* We have 2 controllers, one for the BMC flash, one for the PNOR */ + uint8_t type; + + /* Address and previous value of the ctrl register */ + uint32_t ctl_reg; + + /* Control register value for normal commands */ + uint32_t ctl_val; + + /* Control register value for (fast) reads */ + uint32_t ctl_read_val; + + /* Address of the flash mapping */ + uint32_t flash; + + /* Current 4b mode */ + bool mode_4b; + + /* Callbacks */ + struct spi_flash_ctrl ops; +}; + +static int ast_sf_start_cmd(struct ast_sf_ctrl *ct, uint8_t cmd) +{ + /* Switch to user mode, CE# dropped */ + ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg); + + /* user mode, CE# active */ + ast_ahb_writel(ct->ctl_val | 3, ct->ctl_reg); + + /* write cmd */ + return ast_copy_to_ahb(ct->flash, &cmd, 1); +} + +static void ast_sf_end_cmd(struct ast_sf_ctrl *ct) +{ + /* clear CE# */ + ast_ahb_writel(ct->ctl_val | 7, ct->ctl_reg); + + /* Switch back to read mode */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); +} + +static int ast_sf_send_addr(struct ast_sf_ctrl *ct, uint32_t addr) +{ + const void *ap; + + /* Layout address MSB first in memory */ + addr = cpu_to_be32(addr); + + /* Send the right amount of bytes */ + ap = (char *)&addr; + + if (ct->mode_4b) + return ast_copy_to_ahb(ct->flash, ap, 4); + else + return ast_copy_to_ahb(ct->flash, ap + 1, 3); +} + +static int ast_sf_cmd_rd(struct spi_flash_ctrl *ctrl, uint8_t cmd, + bool has_addr, uint32_t addr, void *buffer, + uint32_t size) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + int rc; + + rc = ast_sf_start_cmd(ct, cmd); + if (rc) + goto bail; + if (has_addr) { + rc = ast_sf_send_addr(ct, addr); + if (rc) + goto bail; + } + if (buffer && size) + rc = ast_copy_from_ahb(buffer, ct->flash, size); + bail: + ast_sf_end_cmd(ct); + return rc; +} + +static int ast_sf_cmd_wr(struct spi_flash_ctrl *ctrl, uint8_t cmd, + bool has_addr, uint32_t addr, const void *buffer, + uint32_t size) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + int rc; + + rc = ast_sf_start_cmd(ct, cmd); + if (rc) + goto bail; + if (has_addr) { + rc = ast_sf_send_addr(ct, addr); + if (rc) + goto bail; + } + if (buffer && size) + rc = ast_copy_to_ahb(ct->flash, buffer, size); + bail: + ast_sf_end_cmd(ct); + return rc; +} + +static int ast_sf_set_4b(struct spi_flash_ctrl *ctrl, bool enable) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + + if (ct->type != AST_SF_TYPE_PNOR) + return enable ? 
FLASH_ERR_4B_NOT_SUPPORTED : 0; + + /* + * We update the "old" value as well since when quitting + * we don't restore the mode of the flash itself so we need + * to leave the controller in a compatible setup + */ + if (enable) { + ct->ctl_val |= 0x2000; + ct->ctl_read_val |= 0x2000; + } else { + ct->ctl_val &= ~0x2000; + ct->ctl_read_val &= ~0x2000; + } + ct->mode_4b = enable; + + /* Update read mode */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + return 0; +} + +static int ast_sf_read(struct spi_flash_ctrl *ctrl, uint32_t pos, + void *buf, uint32_t len) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + + /* + * We are in read mode by default. We don't yet support fancy + * things like fast read or X2 mode + */ + return ast_copy_from_ahb(buf, ct->flash + pos, len); +} + +static int ast_sf_setup(struct spi_flash_ctrl *ctrl, struct flash_info *info, + uint32_t *tsize) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + + (void)tsize; + + /* + * Configure better timings and read mode for known + * flash chips + */ + switch(info->id) { + case 0xc22019: /* MX25L25635F */ + case 0xc2201a: /* MX66L51235F */ + /* + * Those Macronix chips support dual IO reads at 104Mhz + * with 8 dummy cycles so let's use HCLK/2 which is 96Mhz. + * + * We use DREAD (dual read) for now as it defaults to 8 + * dummy cycles. Eventually we'd like to use 2READ (which + * also has the address using 2 IOs) but that defaults + * to 6 dummy cycles and we can only do a multiple of bytes + * (Note: I think that accounts for the dual IO so a byte is + * probably 4 clocks in that mode, but I need to dlb check). + * + * We can change the configuration of the flash so we can + * do that later, it's a bit more complex. + * + * The CE# inactive width for reads must be 7ns, we set it + * to 2T which is about 10.4ns. + * + * For write and program it's 30ns so let's set the value + * for normal ops to 6T. + * + * Preserve the current 4b mode. + */ + ct->ctl_read_val = (ct->ctl_read_val & 0x2000) | + (0x02 << 28) | /* Dual bit data only */ + (0x0e << 24) | /* CE# width 2T (b1110) */ + (0x3b << 16) | /* DREAD command */ + (0x07 << 8) | /* HCLK/2 */ + (0x01 << 6) | /* 1-byte dummy cycle */ + (0x01); /* fast read */ + + /* Configure SPI flash read timing ? */ + + /* + * For other commands and writes also increase the SPI clock + * to HCLK/2 since the chip supports up to 133Mhz and set + * CE# inactive to 6T + */ + ct->ctl_val = (ct->ctl_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x0a << 24) | /* CE# width 6T (b1010) */ + (0x00 << 16) | /* no command */ + (0x07 << 8) | /* HCLK/2 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + + /* Update chip with current read config */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + break; + case 0xef4018: /* W25Q128BV */ + /* + * This Windbond chip support dual IO reads at 104Mhz + * with 8 dummy cycles so let's use HCLK/2. + * + * The CE# inactive width for reads must be 10ns, we set it + * to 3T which is about 15.6ns. + */ + ct->ctl_read_val = + (0x02 << 28) | /* Dual bit data only */ + (0x0e << 24) | /* CE# width 2T (b1110) */ + (0x3b << 16) | /* DREAD command */ + (0x07 << 8) | /* HCLK/2 */ + (0x01 << 6) | /* 1-byte dummy cycle */ + (0x01); /* fast read */ + + /* Configure SPI flash read timing ? */ + + /* + * For other commands and writes also increase the SPI clock + * to HCLK/2 since the chip supports up to 133Mhz. CE# inactive + * for write and erase is 50ns so let's set it to 10T. 
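+ *
+ * (Editor's note, inferred from the encodings used in this file rather than
+ * stated in the original: the CE# inactive width field in bits 27:24 appears
+ * to be (16 - width_in_T), e.g. 16 - 2 = 0xe for 2T, 16 - 6 = 0xa for 6T and
+ * 16 - 10 = 0x6 for the 10T used below.)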
+ */ + ct->ctl_val = + (0x00 << 28) | /* Single bit */ + (0x06 << 24) | /* CE# width 10T (b0110) */ + (0x00 << 16) | /* no command */ + (0x07 << 8) | /* HCLK/2 */ + (0x00 << 6) | /* no dummy cycle */ + (0x01); /* fast read */ + + /* Update chip with current read config */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + break; + } + return 0; +} + +static bool ast_sf_init_pnor(struct ast_sf_ctrl *ct) +{ + uint32_t reg; + + ct->ctl_reg = PNOR_SPI_FCTL_CTRL; + ct->flash = PNOR_FLASH_BASE; + + /* Enable writing to the controller */ + reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF); + if (reg == 0xffffffff) { + FL_ERR("AST_SF: Failed read from controller config\n"); + return false; + } + ast_ahb_writel(reg | 1, PNOR_SPI_FCTL_CONF); + + /* + * Snapshot control reg and sanitize it for our + * use, switching to 1-bit mode, clearing user + * mode if set, etc... + * + * Also configure SPI clock to something safe + * like HCLK/8 (24Mhz) + */ + ct->ctl_val = ast_ahb_readl(ct->ctl_reg); + if (ct->ctl_val == 0xffffffff) { + FL_ERR("AST_SF: Failed read from controller control\n"); + return false; + } + + ct->ctl_val = (ct->ctl_val & 0x2000) | + (0x00 << 28) | /* Single bit */ + (0x00 << 24) | /* CE# width 16T */ + (0x00 << 16) | /* no command */ + (0x04 << 8) | /* HCLK/8 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + + /* Initial read mode is default */ + ct->ctl_read_val = ct->ctl_val; + + /* Configure for read */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + if (ct->ctl_val & 0x2000) + ct->mode_4b = true; + else + ct->mode_4b = false; + + return true; +} + +static bool ast_sf_init_bmc(struct ast_sf_ctrl *ct) +{ + ct->ctl_reg = BMC_SPI_FCTL_CTRL; + ct->flash = BMC_FLASH_BASE; + + /* + * Snapshot control reg and sanitize it for our + * use, switching to 1-bit mode, clearing user + * mode if set, etc... 
+ * + * Also configure SPI clock to something safe + * like HCLK/8 (24Mhz) + */ + ct->ctl_val = + (0x00 << 28) | /* Single bit */ + (0x00 << 24) | /* CE# width 16T */ + (0x00 << 16) | /* no command */ + (0x04 << 8) | /* HCLK/8 */ + (0x00 << 6) | /* no dummy cycle */ + (0x00); /* normal read */ + + /* Initial read mode is default */ + ct->ctl_read_val = ct->ctl_val; + + /* Configure for read */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + ct->mode_4b = false; + + return true; +} + +int ast_sf_open(uint8_t type, struct spi_flash_ctrl **ctrl) +{ + struct ast_sf_ctrl *ct; + + if (type != AST_SF_TYPE_PNOR && type != AST_SF_TYPE_BMC) + return -EINVAL; + + *ctrl = NULL; + ct = malloc(sizeof(*ct)); + if (!ct) { + FL_ERR("AST_SF: Failed to allocate\n"); + return -ENOMEM; + } + memset(ct, 0, sizeof(*ct)); + ct->type = type; + ct->ops.cmd_wr = ast_sf_cmd_wr; + ct->ops.cmd_rd = ast_sf_cmd_rd; + ct->ops.set_4b = ast_sf_set_4b; + ct->ops.read = ast_sf_read; + ct->ops.setup = ast_sf_setup; + + if (type == AST_SF_TYPE_PNOR) { + if (!ast_sf_init_pnor(ct)) + goto fail; + } else { + if (!ast_sf_init_bmc(ct)) + goto fail; + } + + *ctrl = &ct->ops; + + return 0; + fail: + free(ct); + return -EIO; +} + +void ast_sf_close(struct spi_flash_ctrl *ctrl) +{ + struct ast_sf_ctrl *ct = container_of(ctrl, struct ast_sf_ctrl, ops); + + /* Restore control reg to read */ + ast_ahb_writel(ct->ctl_read_val, ct->ctl_reg); + + /* Additional cleanup */ + if (ct->type == AST_SF_TYPE_PNOR) { + uint32_t reg = ast_ahb_readl(PNOR_SPI_FCTL_CONF); + if (reg != 0xffffffff) + ast_ahb_writel(reg & ~1, PNOR_SPI_FCTL_CONF); + } + + /* Free the whole lot */ + free(ct); +} + diff --git a/hw/cec.c b/hw/cec.c new file mode 100644 index 00000000..d8d1354a --- /dev/null +++ b/hw/cec.c @@ -0,0 +1,84 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +/* + * Note: This file os only used on P7/P7+ + */ +#define MAX_IO_HUBS 0x80 + +static struct io_hub *cec_iohubs[MAX_IO_HUBS]; + +struct io_hub *cec_get_hub_by_id(uint32_t hub_id) +{ + if (hub_id >= MAX_IO_HUBS) + return NULL; + return cec_iohubs[hub_id]; +} + +void cec_register(struct io_hub *hub) +{ + cec_iohubs[hub->hub_id] = hub; +} + +void cec_reset(void) +{ + unsigned int i; + + /* Reset IO Hubs */ + for (i = 0; i < MAX_IO_HUBS; i++) { + if (!cec_iohubs[i] || !cec_iohubs[i]->ops->reset) + continue; + cec_iohubs[i]->ops->reset(cec_iohubs[i]); + } +} + +static int64_t opal_pci_set_hub_tce_memory(uint64_t hub_id, + uint64_t tce_mem_addr, + uint64_t tce_mem_size) +{ + struct io_hub *hub = cec_get_hub_by_id(hub_id); + + if (!hub) + return OPAL_PARAMETER; + + if (!hub->ops->set_tce_mem) + return OPAL_UNSUPPORTED; + + return hub->ops->set_tce_mem(hub, tce_mem_addr, tce_mem_size); +} +opal_call(OPAL_PCI_SET_HUB_TCE_MEMORY, opal_pci_set_hub_tce_memory, 3); + +static int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, + void *diag_buffer, + uint64_t diag_buffer_len) +{ + struct io_hub *hub = cec_get_hub_by_id(hub_id); + + if (!hub) + return OPAL_PARAMETER; + + if (!hub->ops->get_diag_data) + return OPAL_UNSUPPORTED; + + return hub->ops->get_diag_data(hub, diag_buffer, diag_buffer_len); +} +opal_call(OPAL_PCI_GET_HUB_DIAG_DATA, opal_pci_get_hub_diag_data, 3); diff --git a/hw/centaur.c b/hw/centaur.c new file mode 100644 index 00000000..20f63cc3 --- /dev/null +++ b/hw/centaur.c @@ -0,0 +1,326 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Centaur chip IDs are using the XSCOM "partID" encoding + * described in xscom.h. recap: + * + * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM + * N=Node, C=Chip, M=Memory Channel + * + * We currently use FSI exclusively for centaur access. We can + * start using MMIO on Centaur DD2.x when we have a way to handle + * machine checks happening inside Sapphire which we don't at the + * moment. + */ +struct centaur_chip { + bool valid; + uint8_t ec_level; + uint32_t fsi_master_chip_id; + uint32_t fsi_master_port; + uint32_t fsi_master_engine; + struct lock lock; +}; + +/* Is that correct ? */ +#define MAX_CENTAURS_PER_CHIP 8 + +/* + * FSI2PIB register definitions (this could be moved out if we were to + * support FSI master to other chips. 
+ */
+#define FSI_DATA0_REG		0x1000
+#define FSI_DATA1_REG		0x1004
+#define FSI_CMD_REG		0x1008
+#define   FSI_CMD_WR		0x80000000
+#define   FSI_CMD_RD		0x00000000
+#define FSI_ENG_RESET_REG	0x1018
+#define FSI_STATUS_REG		0x101c
+#define   FSI_STATUS_ABORT	0x00100000
+#define   FSI_STATUS_ERRORS	0x00007000
+
+static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur)
+{
+	int64_t rc;
+	uint32_t stat;
+
+	rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+		       centaur->fsi_master_port, FSI_STATUS_REG, &stat);
+	if (rc) {
+		/* XXX Improve logging */
+		prerror("CENTAUR: MFSI read error %lld reading STAT\n", rc);
+		return rc;
+	}
+	if ((stat & (FSI_STATUS_ABORT | FSI_STATUS_ERRORS)) == 0)
+		return OPAL_SUCCESS;
+
+	prerror("CENTAUR: Remote FSI error, stat=0x%08x\n", stat);
+
+	/* XXX Handle recovery */
+
+	return OPAL_HARDWARE;
+}
+
+static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_addr,
+				    uint64_t *val)
+{
+	int64_t rc;
+	uint32_t data0, data1;
+
+	rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+			centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_RD);
+	if (rc) {
+		/* XXX Improve logging */
+		prerror("CENTAUR: MFSI write error %lld writing CMD\n", rc);
+		return rc;
+	}
+
+	rc = centaur_fsiscom_complete(centaur);
+	if (rc)
+		return rc;
+
+	rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+		       centaur->fsi_master_port, FSI_DATA0_REG, &data0);
+	if (rc) {
+		/* XXX Improve logging */
+		prerror("CENTAUR: MFSI read error %lld reading DATA0\n", rc);
+		return rc;
+	}
+	rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+		       centaur->fsi_master_port, FSI_DATA1_REG, &data1);
+	if (rc) {
+		/* XXX Improve logging */
+		prerror("CENTAUR: MFSI read error %lld reading DATA1\n", rc);
+		return rc;
+	}
+
+	*val = (((uint64_t)data0) << 32) | data1;
+
+	return OPAL_SUCCESS;
+}
+
+static struct centaur_chip *centaur_get(uint32_t part_id)
+{
+	uint32_t hchip_id, mchan;
+	struct proc_chip *hchip;
+	struct centaur_chip *centaur;
+
+	if ((part_id >> 28) != 8) {
+		prerror("CENTAUR: Invalid part ID 0x%x\n", part_id);
+		return NULL;
+	}
+	hchip_id = (part_id & 0x0fffffff) >> 4;
+	mchan = part_id & 0xf;
+
+	hchip = get_chip(hchip_id);
+	if (!hchip) {
+		prerror("CENTAUR: Centaur 0x%x not found on non-existing chip 0x%x\n",
+			part_id, hchip_id);
+		return NULL;
+	}
+	if (mchan >= MAX_CENTAURS_PER_CHIP) {
+		prerror("CENTAUR: Centaur 0x%x channel out of bounds !\n", part_id);
+		return NULL;
+	}
+	if (!hchip->centaurs) {
+		prerror("CENTAUR: Centaur 0x%x not found on chip 0x%x (no centaurs)\n",
+			part_id, hchip_id);
+		return NULL;
+	}
+	centaur = &hchip->centaurs[mchan];
+	if (!centaur->valid) {
+		prerror("CENTAUR: Centaur 0x%x not valid on chip 0x%x\n",
+			part_id, hchip_id);
+		return NULL;
+	}
+	return centaur;
+}
+
+static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr,
+				     uint64_t val)
+{
+	int64_t rc;
+
+	rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+			centaur->fsi_master_port, FSI_DATA0_REG, hi32(val));
+	if (rc) {
+		/* XXX Improve logging */
+		prerror("CENTAUR: MFSI write error %lld writing DATA0\n", rc);
+		return rc;
+	}
+	rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+			centaur->fsi_master_port, FSI_DATA1_REG, lo32(val));
+	if (rc) {
+		/* XXX Improve logging */
+		prerror("CENTAUR: MFSI write error %lld writing DATA1\n", rc);
+		return rc;
+	}
+	rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR); + if (rc) { + /* XXX Improve logging */ + prerror("CENTAUR: MFSI write error %lld writing CMD\n", rc); + return rc; + } + + return centaur_fsiscom_complete(centaur); +} + +int64_t centaur_xscom_read(uint32_t id, uint64_t pcb_addr, uint64_t *val) +{ + struct centaur_chip *centaur = centaur_get(id); + int64_t rc; + + if (!centaur) + return OPAL_PARAMETER; + + lock(¢aur->lock); + rc = centaur_fsiscom_read(centaur, pcb_addr, val); + unlock(¢aur->lock); + + return rc; +} + +int64_t centaur_xscom_write(uint32_t id, uint64_t pcb_addr, uint64_t val) +{ + struct centaur_chip *centaur = centaur_get(id); + int64_t rc; + + if (!centaur) + return OPAL_PARAMETER; + + lock(¢aur->lock); + rc = centaur_fsiscom_write(centaur, pcb_addr, val); + unlock(¢aur->lock); + + return rc; +} + +static bool centaur_check_id(struct centaur_chip *centaur) +{ + int64_t rc; + uint64_t val; + + rc = centaur_fsiscom_read(centaur, 0xf000f, &val); + if (rc) { + prerror("CENTAUR: FSISCOM error %lld reading ID register\n", + rc); + return false; + } + + /* Extract CFAM id */ + val >>= 44; + + /* Identify chip */ + if ((val & 0xff) != 0xe9) { + prerror("CENTAUR: CFAM ID 0x%02x is not a Centaur !\n", + (unsigned int)(val & 0xff)); + return false; + } + + /* Get EC level from CFAM ID */ + centaur->ec_level = ((val >> 16) & 0xf) << 4; + centaur->ec_level |= (val >> 8) & 0xf; + + return true; +} + +static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng, + uint32_t mport) +{ + uint32_t hchip_id, mchan; + struct proc_chip *hchip; + struct centaur_chip *centaur; + + if ((part_id >> 28) != 8) { + prerror("CENTAUR: Invalid part ID 0x%x\n", part_id); + return false; + } + hchip_id = (part_id & 0x0fffffff) >> 4; + mchan = part_id & 0xf; + + printf("CENTAUR: Found centaur for chip 0x%x channel %d\n", + hchip_id, mchan); + printf("CENTAUR: FSI host: 0x%x cMFSI%d port %d\n", + mchip, meng, mport); + + hchip = get_chip(hchip_id); + if (!hchip) { + prerror("CENTAUR: No such chip !!!\n"); + return false; + } + + if (mchan >= MAX_CENTAURS_PER_CHIP) { + prerror("CENTAUR: Channel out of bounds !\n"); + return false; + } + + if (!hchip->centaurs) { + hchip->centaurs = + zalloc(sizeof(struct centaur_chip) * + MAX_CENTAURS_PER_CHIP); + assert(hchip->centaurs); + } + + centaur = &hchip->centaurs[mchan]; + if (centaur->valid) { + prerror("CENTAUR: Duplicate centaur !\n"); + return false; + } + centaur->fsi_master_chip_id = mchip; + centaur->fsi_master_port = mport; + centaur->fsi_master_engine = meng ? 
MFSI_cMFSI1 : MFSI_cMFSI0; + init_lock(¢aur->lock); + + if (!centaur_check_id(centaur)) + return false; + + printf("CENTAUR: ChipID 0x%x [DD%x.%x]\n", part_id, + centaur->ec_level >> 4, + centaur->ec_level & 0xf); + + centaur->valid = true; + return true; +} + +void centaur_init(void) +{ + struct dt_node *cn; + + dt_for_each_compatible(dt_root, cn, "ibm,centaur-v10") { + uint32_t chip_id, mchip, meng, mport; + + chip_id = dt_prop_get_u32(cn, "ibm,chip-id"); + mchip = dt_prop_get_u32(cn, "ibm,fsi-master-chip-id"); + meng = dt_prop_get_cell(cn, "ibm,fsi-master-port", 0); + mport = dt_prop_get_cell(cn, "ibm,fsi-master-port", 1); + + /* + * If adding the centaur succeeds, we expose it to + * Linux as a scom-controller + */ + if (centaur_add(chip_id, mchip, meng, mport)) + dt_add_property(cn, "scom-controller", NULL, 0); + } +} diff --git a/hw/chiptod.c b/hw/chiptod.c new file mode 100644 index 00000000..e24d9667 --- /dev/null +++ b/hw/chiptod.c @@ -0,0 +1,685 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Handle ChipTOD chip & configure core timebases + */ +#include +#include +#include +#include +#include +#include + +//#define DBG(fmt...) printf("CHIPTOD: " fmt) +#define DBG(fmt...) do { } while(0) + +/* TOD chip XSCOM addresses */ +#define TOD_TTYPE_0 0x00040011 +#define TOD_TTYPE_1 0x00040012 /* PSS switch */ +#define TOD_TTYPE_2 0x00040013 /* Enable step checkers */ +#define TOD_TTYPE_3 0x00040014 /* Request TOD */ +#define TOD_TTYPE_4 0x00040015 /* Send TOD */ +#define TOD_TTYPE_5 0x00040016 /* Invalidate TOD */ +#define TOD_CHIPTOD_TO_TB 0x00040017 +#define TOD_LOAD_TOD_MOD 0x00040018 +#define TOD_CHIPTOD_VALUE 0x00040020 +#define TOD_CHIPTOD_LOAD_TB 0x00040021 +#define TOD_CHIPTOD_FSM 0x00040024 + +/* -- TOD PIB Master reg -- */ +#define TOD_PIB_MASTER 0x00040027 +#define TOD_PIBM_ADDR_CFG_MCAST PPC_BIT(25) +#define TOD_PIBM_ADDR_CFG_SLADDR_MASK PPC_BITMASK(26,31) +#define TOD_PIBM_ADDR_CFG_SLADDR_LSH PPC_BITLSHIFT(31) + +/* -- TOD Error interrupt register -- */ +#define TOD_ERROR 0x00040030 +/* SYNC errors */ +#define TOD_ERR_CRMO_PARITY PPC_BIT(0) +#define TOD_ERR_OSC0_PARITY PPC_BIT(1) +#define TOD_ERR_OSC1_PARITY PPC_BIT(2) +#define TOD_ERR_CRITC_PARITY PPC_BIT(13) +#define TOD_ERR_PSS_HAMMING_DISTANCE PPC_BIT(18) +#define TOD_ERR_DELAY_COMPL_PARITY PPC_BIT(22) +/* CNTR errors */ +#define TOD_ERR_CTCR_PARITY PPC_BIT(32) +#define TOD_ERR_TOD_SYNC_CHECK PPC_BIT(33) +#define TOD_ERR_TOD_FSM_PARITY PPC_BIT(34) +#define TOD_ERR_TOD_REGISTER_PARITY PPC_BIT(35) +#define TOD_ERR_OVERFLOW_YR2042 PPC_BIT(36) +#define TOD_ERR_TOD_WOF_LSTEP_PARITY PPC_BIT(37) +#define TOD_ERR_TTYPE0_RECVD PPC_BIT(38) +#define TOD_ERR_TTYPE1_RECVD PPC_BIT(39) +#define TOD_ERR_TTYPE2_RECVD PPC_BIT(40) +#define TOD_ERR_TTYPE3_RECVD PPC_BIT(41) +#define TOD_ERR_TTYPE4_RECVD PPC_BIT(42) +#define TOD_ERR_TTYPE5_RECVD PPC_BIT(43) + +/* Magic TB value. 
One step cycle ahead of sync */ +#define INIT_TB 0x000000000001ff0 + +/* Number of iterations for the various timeouts */ +#define TIMEOUT_LOOPS 20000000 + +static enum chiptod_type { + chiptod_unknown, + chiptod_p7, + chiptod_p8 +} chiptod_type; + +static int32_t chiptod_primary = -1; +static int32_t chiptod_secondary = -1; + +/* The base TFMR value is the same for the whole machine + * for now as far as I can tell + */ +static uint64_t base_tfmr; + +/* + * For now, we use a global lock for runtime chiptod operations, + * eventually make this a per-core lock for wakeup rsync and + * take all of them for RAS cases. + */ +static struct lock chiptod_lock = LOCK_UNLOCKED; + +static void chiptod_setup_base_tfmr(void) +{ + struct dt_node *cpu = this_cpu()->node; + uint64_t core_freq, tod_freq; + uint64_t mcbs; + + base_tfmr = SPR_TFMR_TB_ECLIPZ; + + /* Get CPU and TOD freqs in Hz */ + if (dt_has_node_property(cpu,"ibm,extended-clock-frequency", NULL)) + core_freq = dt_prop_get_u64(cpu,"ibm,extended-clock-frequency"); + else + core_freq = dt_prop_get_u32(cpu, "clock-frequency"); + tod_freq = 32000000; + + /* Calculate the "Max Cycles Between Steps" value according + * to the magic formula: + * + * mcbs = (core_freq * max_jitter_factor) / (4 * tod_freq) / 100; + * + * The max jitter factor is set to 240 based on what pHyp uses. + */ + mcbs = (core_freq * 240) / (4 * tod_freq) / 100; + printf("CHIPTOD: Calculated MCBS is 0x%llx (Cfreq=%lld Tfreq=%lld)\n", + mcbs, core_freq, tod_freq); + + /* Bake that all into TFMR */ + base_tfmr = SETFIELD(SPR_TFMR_MAX_CYC_BET_STEPS, base_tfmr, mcbs); + base_tfmr = SETFIELD(SPR_TFMR_N_CLKS_PER_STEP, base_tfmr, 0); + base_tfmr = SETFIELD(SPR_TFMR_SYNC_BIT_SEL, base_tfmr, 4); +} + +static bool chiptod_mod_tb(void) +{ + uint64_t tfmr = base_tfmr; + uint64_t timeout = 0; + + /* Switch timebase to "Not Set" state */ + mtspr(SPR_TFMR, tfmr | SPR_TFMR_LOAD_TOD_MOD); + do { + if (++timeout >= (TIMEOUT_LOOPS*2)) { + prerror("CHIPTOD: TB \"Not Set\" timeout\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("CHIPTOD: TB \"Not Set\" TFMR corrupt\n"); + return false; + } + if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == 9) { + prerror("CHIPTOD: TB \"Not Set\" TOD in error state\n"); + return false; + } + } while(tfmr & SPR_TFMR_LOAD_TOD_MOD); + + return true; +} + +static bool chiptod_interrupt_check(void) +{ + uint64_t tfmr = mfspr(SPR_TFMR); + uint64_t timeout = 0; + + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("CHIPTOD: Interrupt check fail\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("CHIPTOD: Interrupt check TFMR corrupt !\n"); + return false; + } + } while(tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT); + + return true; +} + +static bool chiptod_poll_running(void) +{ + uint64_t timeout = 0; + uint64_t tval; + + /* Chip TOD running check */ + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("CHIPTOD: Running check fail timeout\n"); + return false; + } + if (xscom_readme(TOD_CHIPTOD_FSM, &tval) != 0) { + prerror("CHIPTOD: XSCOM error polling run\n"); + return false; + } + } while(!(tval & 0x0800000000000000UL)); + + return true; +} + +static bool chiptod_to_tb(void) +{ + uint64_t tval, tfmr, tvbits; + uint64_t timeout = 0; + + /* Tell the ChipTOD about our fabric address + * + * The pib_master value is calculated from the CPU core ID, given in + * the PIR. 
Because we have different core/thread arrangements in the + * PIR between p7 and p8, we need to do the calculation differently. + * + * p7: 0b00001 || 3-bit core id + * p8: 0b0001 || 4-bit core id + */ + + if (xscom_readme(TOD_PIB_MASTER, &tval) != 0) { + prerror("CHIPTOD: XSCOM error reading PIB_MASTER\n"); + return false; + } + if (chiptod_type == chiptod_p8) { + tvbits = (this_cpu()->pir >> 3) & 0xf; + tvbits |= 0x10; + } else { + tvbits = (this_cpu()->pir >> 2) & 0x7; + tvbits |= 0x08; + } + tval &= ~TOD_PIBM_ADDR_CFG_MCAST; + tval = SETFIELD(TOD_PIBM_ADDR_CFG_SLADDR, tval, tvbits); + if (xscom_writeme(TOD_PIB_MASTER, tval) != 0) { + prerror("CHIPTOD: XSCOM error writing PIB_MASTER\n"); + return false; + } + + /* Make us ready to get the TB from the chipTOD */ + mtspr(SPR_TFMR, base_tfmr | SPR_TFMR_MOVE_CHIP_TOD_TO_TB); + + /* Tell the ChipTOD to send it */ + if (xscom_writeme(TOD_CHIPTOD_TO_TB, (1ULL << 63)) != 0) { + prerror("CHIPTOD: XSCOM error writing CHIPTOD_TO_TB\n"); + return false; + } + + /* Wait for it to complete */ + timeout = 0; + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("CHIPTOD: Chip to TB timeout\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("CHIPTOD: MoveToTB: corrupt TFMR !\n"); + return false; + } + } while(tfmr & SPR_TFMR_MOVE_CHIP_TOD_TO_TB); + + return true; +} + +static bool chiptod_check_tb_running(void) +{ + /* We used to wait for two SYNC pulses in TFMR but that + * doesn't seem to occur in sim, so instead we use a + * method similar to what pHyp does which is to check for + * TFMR SPR_TFMR_TB_VALID and not SPR_TFMR_TFMR_CORRUPT + */ +#if 0 + uint64_t tfmr, timeout; + unsigned int i; + + for (i = 0; i < 2; i++) { + tfmr = mfspr(SPR_TFMR); + tfmr &= ~SPR_TFMR_TB_SYNC_OCCURED; + mtspr(SPR_TFMR, tfmr); + timeout = 0; + do { + if (++timeout >= TIMEOUT_LOOPS) { + prerror("CHIPTOD: No sync pulses\n"); + return false; + } + tfmr = mfspr(SPR_TFMR); + } while(!(tfmr & SPR_TFMR_TB_SYNC_OCCURED)); + } +#else + uint64_t tfmr = mfspr(SPR_TFMR); + + return (tfmr & SPR_TFMR_TB_VALID) && + !(tfmr & SPR_TFMR_TFMR_CORRUPT); +#endif + return true; +} + +static void chiptod_reset_tb_errors(void) +{ + uint64_t tfmr; + unsigned long timeout = 0; + + /* Ask for automatic clear of errors */ + tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + + /* Additionally pHyp sets these (write-1-to-clear ?) */ + tfmr |= SPR_TFMR_TB_MISSING_SYNC; + tfmr |= SPR_TFMR_TB_MISSING_STEP; + tfmr |= SPR_TFMR_TB_RESIDUE_ERR; + mtspr(SPR_TFMR, tfmr); + + /* We have to write "Clear TB Errors" again */ + tfmr = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS; + mtspr(SPR_TFMR, tfmr); + + do { + if (++timeout >= TIMEOUT_LOOPS) { + /* Don't actually do anything on error for + * now ... not much we can do, panic maybe ? + */ + prerror("CHIPTOD: TB error reset timeout !\n"); + return; + } + tfmr = mfspr(SPR_TFMR); + if (tfmr & SPR_TFMR_TFMR_CORRUPT) { + prerror("CHIPTOD: TB error reset: corrupt TFMR !\n"); + return; + } + } while(tfmr & SPR_TFMR_CLEAR_TB_ERRORS); +} + +static void chiptod_cleanup_thread_tfmr(void) +{ + uint64_t tfmr = base_tfmr; + + tfmr |= SPR_TFMR_PURR_PARITY_ERR; + tfmr |= SPR_TFMR_SPURR_PARITY_ERR; + tfmr |= SPR_TFMR_DEC_PARITY_ERR; + tfmr |= SPR_TFMR_TFMR_CORRUPT; + tfmr |= SPR_TFMR_PURR_OVERFLOW; + tfmr |= SPR_TFMR_SPURR_OVERFLOW; + mtspr(SPR_TFMR, tfmr); +} + +static void chiptod_reset_tod_errors(void) +{ + uint64_t terr; + + /* + * At boot, we clear the errors that the firmware is + * supposed to handle. List provided by the pHyp folks. 
+ */ + + terr = TOD_ERR_CRITC_PARITY; + terr |= TOD_ERR_PSS_HAMMING_DISTANCE; + terr |= TOD_ERR_DELAY_COMPL_PARITY; + terr |= TOD_ERR_CTCR_PARITY; + terr |= TOD_ERR_TOD_SYNC_CHECK; + terr |= TOD_ERR_TOD_FSM_PARITY; + terr |= TOD_ERR_TOD_REGISTER_PARITY; + + if (xscom_writeme(TOD_ERROR, terr) != 0) { + prerror("CHIPTOD: XSCOM error writing TOD_ERROR !\n"); + /* Not much we can do here ... abort ? */ + } +} + +static void chiptod_sync_master(void *data) +{ + bool *result = data; + + printf("CHIPTOD: Master sync on CPU PIR 0x%04x...\n", this_cpu()->pir); + + /* Apply base tfmr */ + mtspr(SPR_TFMR, base_tfmr); + + /* From recipe provided by pHyp folks, reset various errors + * before attempting the sync + */ + chiptod_reset_tb_errors(); + + /* Cleanup thread tfmr bits */ + chiptod_cleanup_thread_tfmr(); + + /* Reset errors in the chiptod itself */ + chiptod_reset_tod_errors(); + + /* Switch timebase to "Not Set" state */ + if (!chiptod_mod_tb()) + goto error; + DBG("SYNC MASTER Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD step checkers enable */ + if (xscom_writeme(TOD_TTYPE_2, (1UL << 63)) != 0) { + prerror("CHIPTOD: XSCOM error enabling steppers\n"); + goto error; + } + + DBG("SYNC MASTER Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD interrupt check */ + if (!chiptod_interrupt_check()) + goto error; + DBG("SYNC MASTER Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Switch local chiptod to "Not Set" state */ + if (xscom_writeme(TOD_LOAD_TOD_MOD, (1UL << 63)) != 0) { + prerror("CHIPTOD: XSCOM error sending LOAD_TOD_MOD\n"); + goto error; + } + + /* Switch all remote chiptod to "Not Set" state */ + if (xscom_writeme(TOD_TTYPE_5, (1UL << 63)) != 0) { + prerror("CHIPTOD: XSCOM error sending TTYPE_5\n"); + goto error; + } + + /* Chip TOD load initial value */ + if (xscom_writeme(TOD_CHIPTOD_LOAD_TB, INIT_TB) != 0) { + prerror("CHIPTOD: XSCOM error setting init TB\n"); + goto error; + } + + DBG("SYNC MASTER Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + if (!chiptod_poll_running()) + goto error; + DBG("SYNC MASTER Step 6 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Move chiptod value to core TB */ + if (!chiptod_to_tb()) + goto error; + DBG("SYNC MASTER Step 7 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Send local chip TOD to all chips TOD */ + if (xscom_writeme(TOD_TTYPE_4, (1ULL << 63)) != 0) { + prerror("CHIPTOD: XSCOM error sending TTYPE_4\n"); + goto error; + } + + /* Check if TB is running */ + if (!chiptod_check_tb_running()) + goto error; + + DBG("Master sync completed, TB=%lx\n", mfspr(SPR_TBRL)); + + /* + * A little delay to make sure the remote chips get up to + * speed before we start syncing them. + * + * We have to do it here because we know our TB is running + * while the boot thread TB might not yet. + */ + time_wait_ms(1); + + *result = true; + return; + error: + prerror("CHIPTOD: Master sync failed! 
TFMR=0x%016lx\n", + mfspr(SPR_TFMR)); + *result = false; +} + +static void chiptod_sync_slave(void *data) +{ + bool *result = data; + + /* Only get primaries, not threads */ + if (this_cpu()->is_secondary) { + /* On secondaries we just cleanup the TFMR */ + chiptod_cleanup_thread_tfmr(); + *result = true; + return; + } + + printf("CHIPTOD: Slave sync on CPU PIR 0x%04x...\n", this_cpu()->pir); + + /* Apply base tfmr */ + mtspr(SPR_TFMR, base_tfmr); + + /* From recipe provided by pHyp folks, reset various errors + * before attempting the sync + */ + chiptod_reset_tb_errors(); + + /* Cleanup thread tfmr bits */ + chiptod_cleanup_thread_tfmr(); + + /* Switch timebase to "Not Set" state */ + if (!chiptod_mod_tb()) + goto error; + DBG("SYNC SLAVE Step 2 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD running check */ + if (!chiptod_poll_running()) + goto error; + DBG("SYNC SLAVE Step 3 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Chip TOD interrupt check */ + if (!chiptod_interrupt_check()) + goto error; + DBG("SYNC SLAVE Step 4 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Move chiptod value to core TB */ + if (!chiptod_to_tb()) + goto error; + DBG("SYNC SLAVE Step 5 TFMR=0x%016lx\n", mfspr(SPR_TFMR)); + + /* Check if TB is running */ + if (!chiptod_check_tb_running()) + goto error; + + DBG("Slave sync completed, TB=%lx\n", mfspr(SPR_TBRL)); + + *result = true; + return; + error: + prerror("CHIPTOD: Slave sync failed ! TFMR=0x%016lx\n", + mfspr(SPR_TFMR)); + *result = false; +} + +bool chiptod_wakeup_resync(void) +{ + lock(&chiptod_lock); + + /* Apply base tfmr */ + mtspr(SPR_TFMR, base_tfmr); + + /* From recipe provided by pHyp folks, reset various errors + * before attempting the sync + */ + chiptod_reset_tb_errors(); + + /* Cleanup thread tfmr bits */ + chiptod_cleanup_thread_tfmr(); + + /* Switch timebase to "Not Set" state */ + if (!chiptod_mod_tb()) + goto error; + + /* Move chiptod value to core TB */ + if (!chiptod_to_tb()) + goto error; + + unlock(&chiptod_lock); + + return true; + error: + prerror("CHIPTOD: Resync failed ! 
TFMR=0x%16lx\n", mfspr(SPR_TFMR)); + unlock(&chiptod_lock); + return false; +} + +static int64_t opal_resync_timebase(void) +{ + if (!chiptod_wakeup_resync()) { + printf("OPAL: Resync timebase failed on CPU 0x%04x\n", + this_cpu()->pir); + return OPAL_HARDWARE; + } + return OPAL_SUCCESS; +} +opal_call(OPAL_RESYNC_TIMEBASE, opal_resync_timebase, 0); + +static void chiptod_print_tb(void *data __unused) +{ + printf("CHIPTOD: PIR 0x%04x TB=%lx\n", + this_cpu()->pir, mfspr(SPR_TBRL)); +} + +static bool chiptod_probe(u32 master_cpu) +{ + struct dt_node *np; + + dt_for_each_compatible(dt_root, np, "ibm,power-chiptod") { + uint32_t chip; + + /* Old DT has chip-id in chiptod node, newer only in the + * parent xscom bridge + */ + chip = dt_get_chip_id(np); + + if (dt_has_node_property(np, "primary", NULL)) { + chiptod_primary = chip; + if (dt_node_is_compatible(np,"ibm,power7-chiptod")) + chiptod_type = chiptod_p7; + if (dt_node_is_compatible(np,"ibm,power8-chiptod")) + chiptod_type = chiptod_p8; + } + + if (dt_has_node_property(np, "secondary", NULL)) + chiptod_secondary = chip; + + } + + /* + * If ChipTOD isn't found in the device-tree, we fallback + * based on the master CPU passed by OPAL boot since the + * FSP strips off the ChipTOD info from the HDAT when booting + * in OPAL mode :-( + */ + if (chiptod_primary < 0) { + struct cpu_thread *t = find_cpu_by_pir(master_cpu); + printf("CHIPTOD: Cannot find a primary TOD in device-tree\n"); + printf("CHIPTOD: Falling back to Master CPU: %d\n", master_cpu); + if (!t) { + prerror("CHIPTOD: NOT FOUND !\n"); + return false; + } + chiptod_primary = t->chip_id; + switch(proc_gen) { + case proc_gen_p7: + chiptod_type = chiptod_p7; + return true; + case proc_gen_p8: + chiptod_type = chiptod_p8; + return true; + default: + break; + } + prerror("CHIPTOD: Unknown fallback CPU type !\n"); + return false; + } + if (chiptod_type == chiptod_unknown) { + prerror("CHIPTOD: Unknown TOD type !\n"); + return false; + } + + return true; +} + +void chiptod_init(u32 master_cpu) +{ + struct cpu_thread *cpu0, *cpu; + bool sres; + + op_display(OP_LOG, OP_MOD_CHIPTOD, 0); + + if (!chiptod_probe(master_cpu)) { + prerror("CHIPTOD: Failed ChipTOD detection !\n"); + op_display(OP_FATAL, OP_MOD_CHIPTOD, 0); + abort(); + } + + op_display(OP_LOG, OP_MOD_CHIPTOD, 1); + + /* Pick somebody on the primary */ + cpu0 = find_cpu_by_chip_id(chiptod_primary); + + /* Calculate the base TFMR value used for everybody */ + chiptod_setup_base_tfmr(); + + printf("CHIPTOD: Base TFMR=0x%016llx\n", base_tfmr); + + /* Schedule master sync */ + sres = false; + cpu_wait_job(cpu_queue_job(cpu0, chiptod_sync_master, &sres), true); + if (!sres) { + op_display(OP_FATAL, OP_MOD_CHIPTOD, 2); + abort(); + } + + op_display(OP_LOG, OP_MOD_CHIPTOD, 2); + + /* Schedule slave sync */ + for_each_available_cpu(cpu) { + /* Skip master */ + if (cpu == cpu0) + continue; + + /* Queue job */ + sres = false; + cpu_wait_job(cpu_queue_job(cpu, chiptod_sync_slave, &sres), + true); + if (!sres) { + op_display(OP_WARN, OP_MOD_CHIPTOD, 3|(cpu->pir << 8)); + + /* Disable threads */ + cpu_disable_all_threads(cpu); + } + op_display(OP_LOG, OP_MOD_CHIPTOD, 3|(cpu->pir << 8)); + } + + /* Display TBs */ + for_each_available_cpu(cpu) { + /* Only do primaries, not threads */ + if (cpu->is_secondary) + continue; + cpu_wait_job(cpu_queue_job(cpu, chiptod_print_tb, NULL), true); + } + + op_display(OP_LOG, OP_MOD_CHIPTOD, 4); +} diff --git a/hw/ec/Makefile.inc b/hw/ec/Makefile.inc new file mode 100644 index 00000000..09c9c848 --- /dev/null 
+++ b/hw/ec/Makefile.inc @@ -0,0 +1,8 @@ +# -*-Makefile-*- +# Sapphire EC makefile + +SUBDIRS += hw/ec +EC_OBJS = gpio.o +EC=hw/ec/built-in.o + +$(EC): $(EC_OBJS:%=hw/ec/%) diff --git a/hw/ec/gpio.c b/hw/ec/gpio.c new file mode 100644 index 00000000..0a2223d2 --- /dev/null +++ b/hw/ec/gpio.c @@ -0,0 +1,87 @@ +/* Copyright 2013-2014 Google Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "ec/config.h" +#include "ec/gpio.h" + +int ec_gpio_setup(EcGpioPort port, uint8_t pin, + int is_output, int pullup_enable) +{ + uint8_t ddr_reg; + if (pin > 7) { + return -1; + } + + /* Set data direction */ + ec_outb(EC_GPIO_INDEX, + port * EC_GPIO_PORT_SKIP + EC_GPIO_DDR_OFFSET); + ddr_reg = ec_inb(EC_GPIO_DATA); + if (is_output) { + ddr_reg |= (1 << pin); + } else { + ddr_reg &= ~(1 << pin); + } + ec_outb(EC_GPIO_DATA, ddr_reg); + + /* Set pullup enable for output GPOs */ + if (is_output) + { + uint8_t pup_reg; + ec_outb(EC_GPIO_INDEX, + port * EC_GPIO_PORT_SKIP + EC_GPIO_PUP_OFFSET); + pup_reg = ec_inb(EC_GPIO_DATA); + if (pullup_enable) { + pup_reg |= (1 << pin); + } else { + pup_reg &= ~(1 << pin); + } + ec_outb(EC_GPIO_DATA, pup_reg); + } + + return 0; +} + +int ec_gpio_read(EcGpioPort port, uint8_t pin) +{ + uint8_t pin_reg; + if (pin > 7) { + return -1; + } + + ec_outb(EC_GPIO_INDEX, + port * EC_GPIO_PORT_SKIP + EC_GPIO_PIN_OFFSET); + pin_reg = ec_inb(EC_GPIO_DATA); + return !!(pin_reg & (1 << pin)); +} + +int ec_gpio_set(EcGpioPort port, uint8_t pin, int val) +{ + uint8_t data_reg; + if (pin > 7) { + return -1; + } + + ec_outb(EC_GPIO_INDEX, + port * EC_GPIO_PORT_SKIP + EC_GPIO_DATA_OFFSET); + data_reg = ec_inb(EC_GPIO_DATA); + if (val) { + data_reg |= (1 << pin); + } else { + data_reg &= ~(1 << pin); + } + ec_outb(EC_GPIO_DATA, data_reg); + return 0; +} diff --git a/hw/ec/makefile b/hw/ec/makefile new file mode 100644 index 00000000..e6ceafaf --- /dev/null +++ b/hw/ec/makefile @@ -0,0 +1,8 @@ +ROOTPATH = ../../.. +MODULE = ec + +OBJS = cmosdd.o gpio.o rhesus.o hostboot.o + +SUBDIRS = test.d + +include ${ROOTPATH}/config.mk diff --git a/hw/fsi-master.c b/hw/fsi-master.c new file mode 100644 index 00000000..67d337a6 --- /dev/null +++ b/hw/fsi-master.c @@ -0,0 +1,297 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include + +//#define DBG(fmt...) printf("MFSI: " fmt) +#define DBG(fmt...) 
do { } while(0) + + +/* + * FSI Masters sit on OPB busses behind PIB2OPB bridges + * + * There are two cMFSI behind two different bridges at + * different XSCOM addresses. For now we don't have them in + * the device-tree so we hard code the address + */ +#define PIB2OPB_MFSI0_ADDR 0x20000 +#define PIB2OPB_MFSI1_ADDR 0x30000 + +/* + * Bridge registers on XSCOM that allow generatoin + * of OPB cycles + */ +#define PIB2OPB_REG_CMD 0x0 +#define OPB_CMD_WRITE 0x80000000 +#define OPB_CMD_READ 0x00000000 +#define OPB_CMD_8BIT 0x00000000 +#define OPB_CMD_16BIT 0x20000000 +#define OPB_CMD_32BIT 0x60000000 +#define PIB2OPB_REG_STAT 0x1 +#define OPB_STAT_BUSY 0x00010000 +#define OPB_STAT_READ_VALID 0x00020000 +#define OPB_STAT_ERR_OPB 0x09F00000 +#define OPB_STAT_ERR_CMFSI 0x0000FC00 +#define OPB_STAT_ERR_MFSI 0x000000FC +#define OPB_STAT_ERR_ANY (OPB_STAT_ERR_OPB | \ + OPB_STAT_ERR_CMFSI | \ + OPB_STAT_ERR_MFSI) +#define PIB2OPB_REG_LSTAT 0x2 + +/* + * PIB2OPB 0 has 2 MFSIs, cMFSI and hMFSI, PIB2OPB 1 only + * has cMFSI + */ +#define cMFSI_OPB_PORT_BASE 0x40000 +#define cMFSI_OPB_REG_BASE 0x03000 +#define hMFSI_OPB_PORT_BASE 0x80000 +#define hMFSI_OPB_REG_BASE 0x03400 +#define MFSI_OPB_PORT_STRIDE 0x08000 + + +/* + * Use a global FSI lock for now. Beware of re-entrancy + * if we ever add support for normal chip XSCOM via FSI, in + * which case we'll probably have to consider either per chip + * lock (which can have AB->BA deadlock issues) or a re-entrant + * global lock + */ +static struct lock fsi_lock = LOCK_UNLOCKED; +static uint32_t mfsi_valid_err = OPB_STAT_ERR_ANY; + +/* + * OPB accessors + */ + +#define MFSI_OPB_MAX_TRIES 120 + +static int64_t mfsi_handle_opb_error(uint32_t chip, uint32_t xscom_base, + uint32_t stat) +{ + int64_t rc; + + prerror("MFSI: Error status=0x%08x !\n", stat); + + /* XXX Dump a bunch of data, create an error log ... */ + + /* Clean error */ + rc = xscom_write(chip, xscom_base + PIB2OPB_REG_STAT, 0); + if (rc) + prerror("MFSI: XSCOM error %lld clearing status\n", rc); + + /* + * XXX HB resets the ports here, but that's broken as it will + * re-enter the opb accessors ... the HW is a mess here, it mixes + * the OPB stuff with the FSI stuff in horrible ways. + * If we want to reset the port and generally handle FSI specific + * errors we should do that at the upper level and leave only the + * OPB error handling here. + * + * We probably need to return "stat" to the callers too for that + * to work + */ + + return OPAL_HARDWARE; +} + +static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base, + uint32_t *read_data) +{ + unsigned long retries = MFSI_OPB_MAX_TRIES; + uint64_t sval; + uint32_t stat; + int64_t rc; + + /* We try again every 10us for a bit more than 1ms */ + for (;;) { + /* Read OPB status register */ + rc = xscom_read(chip, xscom_base + PIB2OPB_REG_STAT, &sval); + if (rc) { + /* Do something here ? */ + prerror("MFSI: XSCOM error %lld read OPB STAT\n", rc); + return rc; + } + DBG(" STAT=0x%16llx...\n", sval); + + stat = sval >> 32; + + /* Complete */ + if (!(stat & OPB_STAT_BUSY)) + break; + /* Error */ + if (stat & mfsi_valid_err) + break; + if (retries-- == 0) { + /* XXX What should we do here ? reset it ? */ + prerror("MFSI: OPB POLL timeout !\n"); + return OPAL_HARDWARE; + } + time_wait_us(10); + } + + /* Did we have an error ? */ + if (stat & mfsi_valid_err) + return mfsi_handle_opb_error(chip, xscom_base, stat); + + if (read_data) { + if (!(stat & OPB_STAT_READ_VALID)) { + prerror("MFSI: Read successful but no data !\n"); + /* What do do here ? 
can it actually happen ? */ + sval |= 0xffffffff; + } + *read_data = sval & 0xffffffff; + } + + return OPAL_SUCCESS; +} + +static int64_t mfsi_opb_read(uint32_t chip, uint32_t xscom_base, + uint32_t addr, uint32_t *data) +{ + uint64_t opb_cmd = OPB_CMD_READ | OPB_CMD_32BIT; + int64_t rc; + + if (addr > 0x00ffffff) + return OPAL_PARAMETER; + + opb_cmd |= addr; + opb_cmd <<= 32; + + DBG("MFSI_OPB_READ: Writing 0x%16llx to XSCOM %x\n", + opb_cmd, xscom_base); + + rc = xscom_write(chip, xscom_base + PIB2OPB_REG_CMD, opb_cmd); + if (rc) { + prerror("MFSI: XSCOM error %lld writing OPB CMD\n", rc); + return rc; + } + return mfsi_opb_poll(chip, xscom_base, data); +} + +static int64_t mfsi_opb_write(uint32_t chip, uint32_t xscom_base, + uint32_t addr, uint32_t data) +{ + uint64_t opb_cmd = OPB_CMD_WRITE | OPB_CMD_32BIT; + int64_t rc; + + if (addr > 0x00ffffff) + return OPAL_PARAMETER; + + opb_cmd |= addr; + opb_cmd <<= 32; + opb_cmd |= data; + + DBG("MFSI_OPB_WRITE: Writing 0x%16llx to XSCOM %x\n", + opb_cmd, xscom_base); + + rc = xscom_write(chip, xscom_base + PIB2OPB_REG_CMD, opb_cmd); + if (rc) { + prerror("MFSI: XSCOM error %lld writing OPB CMD\n", rc); + return rc; + } + return mfsi_opb_poll(chip, xscom_base, NULL); +} + +static int64_t mfsi_get_addrs(uint32_t mfsi, uint32_t port, + uint32_t *xscom_base, uint32_t *port_base, + uint32_t *reg_base) +{ + if (port > 7) + return OPAL_PARAMETER; + + /* We hard code everything for now */ + switch(mfsi) { + case MFSI_cMFSI0: + *xscom_base = PIB2OPB_MFSI0_ADDR; + *port_base = cMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE; + *reg_base = cMFSI_OPB_REG_BASE; + break; + case MFSI_cMFSI1: + *xscom_base = PIB2OPB_MFSI1_ADDR; + *port_base = cMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE; + *reg_base = cMFSI_OPB_REG_BASE; + break; + case MFSI_hMFSI0: + *xscom_base = PIB2OPB_MFSI0_ADDR; + *port_base = hMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE; + *reg_base = hMFSI_OPB_REG_BASE; + break; + default: + return OPAL_PARAMETER; + } + return OPAL_SUCCESS; +} + +int64_t mfsi_read(uint32_t chip, uint32_t mfsi, uint32_t port, + uint32_t fsi_addr, uint32_t *data) +{ + int64_t rc; + uint32_t xscom, port_addr, reg; + + rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, ®); + if (rc) + return rc; + lock(&fsi_lock); + rc = mfsi_opb_read(chip, xscom, port_addr + fsi_addr, data); + /* XXX Handle FSI level errors here, maybe reset port */ + unlock(&fsi_lock); + + return rc; +} + +int64_t mfsi_write(uint32_t chip, uint32_t mfsi, uint32_t port, + uint32_t fsi_addr, uint32_t data) +{ + int64_t rc; + uint32_t xscom, port_addr, reg; + + rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, ®); + if (rc) + return rc; + lock(&fsi_lock); + rc = mfsi_opb_write(chip, xscom, port_addr + fsi_addr, data); + /* XXX Handle FSI level errors here, maybe reset port */ + unlock(&fsi_lock); + + return rc; +} + +void mfsi_init(void) +{ + struct proc_chip *chip; + + /* For now assume all chips are the same DD... might need + * fixing. 
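+ *
+ * Below, Murano chips older than DD2.0 (ec_level < 0x20) get the two
+ * "any-master-error" summary bits dropped from mfsi_valid_err: IBM
+ * bit 16 (0x00008000, cMFSI) and IBM bit 24 (0x00000080, hMFSI),
+ * hence the 0xFFFF7F7F mask, so that mfsi_opb_poll() does not treat
+ * the un-clearable bits as OPB errors.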
+ */ + chip = next_chip(NULL); + assert(chip); + if (chip->type == PROC_CHIP_P8_MURANO) { + /* Hardware Bug HW222712 on Murano DD1.0 causes the + * any_error bit to be un-clearable so we just + * have to ignore it + */ + if (chip->ec_level < 0x20) { + /* 16: cMFSI any-master-error */ + /* 24: hMFSI any-master-error */ + mfsi_valid_err &= 0xFFFF7F7F; + } + } +} + diff --git a/hw/fsp/Makefile.inc b/hw/fsp/Makefile.inc new file mode 100644 index 00000000..c16d0603 --- /dev/null +++ b/hw/fsp/Makefile.inc @@ -0,0 +1,9 @@ +SUBDIRS += hw/fsp + +FSP_OBJS = fsp.o fsp-console.o fsp-rtc.o fsp-nvram.o fsp-sysparam.o +FSP_OBJS += fsp-surveillance.o fsp-codeupdate.o fsp-sensor.o +FSP_OBJS += fsp-diag.o fsp-leds.o fsp-mem-err.o fsp-op-panel.o +FSP_OBJS += fsp-elog-read.o fsp-elog-write.o +FSP_OBJS += fsp-dump.o fsp-mdst-table.o +FSP = hw/fsp/built-in.o +$(FSP): $(FSP_OBJS:%=hw/fsp/%) diff --git a/hw/fsp/fsp-codeupdate.c b/hw/fsp/fsp-codeupdate.c new file mode 100644 index 00000000..be705a48 --- /dev/null +++ b/hw/fsp/fsp-codeupdate.c @@ -0,0 +1,1197 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +enum flash_state { + FLASH_STATE_ABSENT, + FLASH_STATE_INVALID, /* IPL side marker lid is invalid */ + FLASH_STATE_READING, + FLASH_STATE_READ, +}; + +enum lid_fetch_side { + FETCH_T_SIDE_ONLY, + FETCH_P_SIDE_ONLY, + FETCH_BOTH_SIDE, +}; + +static enum flash_state flash_state = FLASH_STATE_INVALID; +static enum lid_fetch_side lid_fetch_side = FETCH_BOTH_SIDE; + +/* Image buffers */ +static struct opal_sg_list *image_data; +static uint32_t tce_start; +static void *lid_data; +static char validate_buf[VALIDATE_BUF_SIZE]; + +/* TCE buffer lock */ +static struct lock flash_lock = LOCK_UNLOCKED; + +/* FW VPD data */ +static struct fw_image_vpd fw_vpd[2]; + +/* Code update related sys parameters */ +static uint32_t ipl_side; +static uint32_t hmc_managed; +static uint32_t update_policy; +static uint32_t in_flight_params; + +/* If non-NULL, this gets called just before rebooting */ +int (*fsp_flash_term_hook)(void); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_GENERAL, OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_FLASH, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_SG_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_COMMIT, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_MSG, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_NOTIFY, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, 
OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_CU_MARKER_LID, OPAL_PLATFORM_ERR_EVT, OPAL_CODEUPDATE, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +static inline void code_update_tce_map(uint32_t tce_offset, + void *buffer, uint32_t size) +{ + uint32_t tlen = ALIGN_UP(size, TCE_PSIZE); + + fsp_tce_map(PSI_DMA_CODE_UPD + tce_offset, buffer, tlen); +} + +static inline void code_update_tce_unmap(uint32_t size) +{ + fsp_tce_unmap(PSI_DMA_CODE_UPD, size); +} + +static inline void set_def_fw_version(uint32_t side) +{ + strncpy(fw_vpd[side].MI_keyword, FW_VERSION_UNKNOWN, MI_KEYWORD_SIZE); + strncpy(fw_vpd[side].ext_fw_id, FW_VERSION_UNKNOWN, ML_KEYWORD_SIZE); +} + +/* + * Get IPL side + */ +static void get_ipl_side(void) +{ + struct dt_node *iplp; + const char *side = NULL; + + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) + side = dt_prop_get_def(iplp, "cec-ipl-side", NULL); + printf("CUPD: IPL SIDE = %s\n", side); + + if (!side || !strcmp(side, "temp")) + ipl_side = FW_IPL_SIDE_TEMP; + else + ipl_side = FW_IPL_SIDE_PERM; +} + + +/* + * Helper routines to retrieve code update related + * system parameters from FSP. + */ + +static void inc_in_flight_param(void) +{ + lock(&flash_lock); + in_flight_params++; + unlock(&flash_lock); +} + +static void dec_in_flight_param(void) +{ + lock(&flash_lock); + assert(in_flight_params > 0); + in_flight_params--; + unlock(&flash_lock); +} + +static void got_code_update_policy(uint32_t param_id __unused, int err_len, + void *data __unused) +{ + if (err_len != 4) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error " + "retrieving code update policy: %d\n", err_len); + } else + printf("CUPD: Code update policy from FSP: %d\n", + update_policy); + + dec_in_flight_param(); +} + +static void get_code_update_policy(void) +{ + int rc; + + inc_in_flight_param(); + rc = fsp_get_sys_param(SYS_PARAM_FLASH_POLICY, &update_policy, 4, + got_code_update_policy, NULL); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), + "CUPD: Error %d queueing param request\n", rc); + dec_in_flight_param(); + } +} + +static void got_platform_hmc_managed(uint32_t param_id __unused, int err_len, + void *data __unused) +{ + if (err_len != 4) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), "CUPD: Error " + "retrieving hmc managed status: %d\n", err_len); + } else + printf("CUPD: HMC managed status from FSP: %d\n", hmc_managed); + + dec_in_flight_param(); +} + +static void get_platform_hmc_managed(void) +{ + int rc; + + inc_in_flight_param(); + rc = fsp_get_sys_param(SYS_PARAM_HMC_MANAGED, &hmc_managed, 4, + got_platform_hmc_managed, NULL); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), + "FLASH: Error %d queueing param request\n", rc); + dec_in_flight_param(); + } +} + +static int64_t code_update_check_state(void) +{ + switch(flash_state) { + case FLASH_STATE_ABSENT: + return OPAL_HARDWARE; + case FLASH_STATE_INVALID: + return OPAL_INTERNAL_ERROR; + case FLASH_STATE_READING: + return OPAL_BUSY; + default: + break; + } + return OPAL_SUCCESS; +} + +/* + * Get common marker LID additional data section + */ +static void *get_adf_sec_data(struct com_marker_adf_sec *adf_sec, + uint32_t name) +{ + struct com_marker_adf_header *adf_header; + int i; + + adf_header = (void *)adf_sec->adf_data; + for (i = 0; i < be32_to_cpu(adf_sec->adf_cnt); i++) { + if (be32_to_cpu(adf_header->name) == name) + return adf_header; + + adf_header = (void *)adf_header + be32_to_cpu(adf_header->size); + } + return NULL; +} 
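+
+/*
+ * Layout assumed by the walk in get_adf_sec_data() (sizes purely
+ * illustrative): a record count followed by back-to-back records,
+ * each starting with a header whose big-endian "size" covers the
+ * whole record, header included:
+ *
+ *	adf_cnt = 2
+ *	[ name = ADF_NAME_SP, size = 0x40 ][ 0x40 - hdr bytes of data ]
+ *	[ name = ...,         size = 0x20 ][ ... ]
+ *
+ * The function returns a pointer to the matching header, or NULL if
+ * no record carries the requested name.
+ */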
+ +/* + * Parse common marker LID to get FW version details + * + * Note: + * At present, we are parsing "Service Pack Nomenclature ADF" + * section only. If we are adding FW IP support, then we have + * to parse "Firmware IP Protection ADF" as well. + */ +static void parse_marker_lid(uint32_t side) +{ + struct com_marker_header *header; + struct com_marker_mi_section *mi_sec; + struct com_marker_adf_sec *adf_sec; + struct com_marker_adf_sp *adf_sp; + + header = (void *)lid_data; + + /* Get MI details */ + mi_sec = (void *)header + be32_to_cpu(header->MI_offset); + /* + * If Marker LID is invalid, then FSP will return a Marker + * LID with ASCII zeros for the entire MI keyword. + */ + if (mi_sec->MI_keyword[0] == '0') + return; + + strncpy(fw_vpd[side].MI_keyword, mi_sec->MI_keyword, MI_KEYWORD_SIZE); + fw_vpd[side].MI_keyword[MI_KEYWORD_SIZE - 1] = '\0'; + printf("CUPD: %s side MI Keyword = %s\n", + side == 0x00 ? "P" : "T", fw_vpd[side].MI_keyword); + + /* Get ML details */ + adf_sec = (void *)header + be32_to_cpu(mi_sec->adf_offset); + adf_sp = get_adf_sec_data(adf_sec, ADF_NAME_SP); + if (!adf_sp) + return; + + strncpy(fw_vpd[side].ext_fw_id, + (void *)adf_sp + be32_to_cpu(adf_sp->sp_name_offset), + ML_KEYWORD_SIZE); + fw_vpd[side].ext_fw_id[ML_KEYWORD_SIZE - 1] = '\0'; + printf("CUPD: %s side ML Keyword = %s\n", + side == 0x00 ? "P" : "T", fw_vpd[side].ext_fw_id); +} + +static void validate_com_marker_lid(void) +{ + if (!strncmp(fw_vpd[ipl_side].MI_keyword, FW_VERSION_UNKNOWN, + sizeof(FW_VERSION_UNKNOWN))) { + log_simple_error(&e_info(OPAL_RC_CU_MARKER_LID), + "CUPD: IPL side Marker LID is not valid\n"); + flash_state = FLASH_STATE_INVALID; + return; + } + + flash_state = FLASH_STATE_READ; +} + +static void fetch_lid_data_complete(struct fsp_msg *msg) +{ + void *buffer; + size_t length, chunk; + uint32_t lid_id, offset; + uint16_t id; + uint8_t flags, status; + + status = (msg->resp->word1 >> 8) & 0xff; + flags = (msg->data.words[0] >> 16) & 0xff; + id = msg->data.words[0] & 0xffff; + lid_id = msg->data.words[1]; + offset = msg->resp->data.words[1]; + length = msg->resp->data.words[2]; + + printf("CUPD: Marker LID id : size : status = 0x%x : 0x%x : 0x%x\n", + msg->data.words[1], msg->resp->data.words[2], status); + + fsp_freemsg(msg); + + switch (status) { + case FSP_STATUS_SUCCESS: /* Read complete, parse VPD */ + parse_marker_lid(lid_id == P_COM_MARKER_LID_ID ? 0 : 1); + break; + case FSP_STATUS_MORE_DATA: /* More data left */ + offset += length; + chunk = MARKER_LID_SIZE - offset; + if (chunk > 0) { + buffer = (void *)PSI_DMA_CODE_UPD + offset; + fsp_fetch_data_queue(flags, id, lid_id, + offset, buffer, &chunk, + fetch_lid_data_complete); + return; + } + break; + default: /* Fetch LID call failed */ + break; + } + + /* If required, fetch T side marker LID */ + if (lid_id == P_COM_MARKER_LID_ID && + lid_fetch_side == FETCH_BOTH_SIDE) { + length = MARKER_LID_SIZE; + fsp_fetch_data_queue(flags, id, T_COM_MARKER_LID_ID, + 0, (void *)PSI_DMA_CODE_UPD, + &length, fetch_lid_data_complete); + return; + } + + lock(&flash_lock); + + /* Validate marker LID data */ + validate_com_marker_lid(); + /* TCE unmap */ + code_update_tce_unmap(MARKER_LID_SIZE); + + unlock(&flash_lock); +} + +static void fetch_com_marker_lid(void) +{ + size_t length = MARKER_LID_SIZE; + uint32_t lid_id; + int rc; + + /* Read in progress? 
*/ + rc = code_update_check_state(); + if (rc == OPAL_HARDWARE || rc == OPAL_BUSY) + return; + + if (lid_fetch_side == FETCH_T_SIDE_ONLY) { + lid_id = T_COM_MARKER_LID_ID; + set_def_fw_version(FW_IPL_SIDE_TEMP); + } else if (lid_fetch_side == FETCH_P_SIDE_ONLY) { + lid_id = P_COM_MARKER_LID_ID; + set_def_fw_version(FW_IPL_SIDE_PERM); + } else { + lid_id = P_COM_MARKER_LID_ID; + set_def_fw_version(FW_IPL_SIDE_PERM); + set_def_fw_version(FW_IPL_SIDE_TEMP); + } + + code_update_tce_map(0, lid_data, length); + rc = fsp_fetch_data_queue(0x00, 0x05, lid_id, 0, + (void *)PSI_DMA_CODE_UPD, &length, + fetch_lid_data_complete); + if (!rc) + flash_state = FLASH_STATE_READING; + else + flash_state = FLASH_STATE_INVALID; +} + +/* + * Add MI and ML keyword details into DT + */ +#define FW_VER_SIZE 64 +static void add_opal_firmware_version(void) +{ + struct dt_node *dt_fw; + char buffer[FW_VER_SIZE]; + int offset; + + dt_fw = dt_find_by_path(dt_root, "ibm,opal/firmware"); + if (!dt_fw) + return; + + /* MI version */ + offset = snprintf(buffer, FW_VER_SIZE, "MI %s %s", + fw_vpd[FW_IPL_SIDE_TEMP].MI_keyword, + fw_vpd[FW_IPL_SIDE_PERM].MI_keyword); + if (ipl_side == FW_IPL_SIDE_TEMP) + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_TEMP].MI_keyword); + else + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_PERM].MI_keyword); + + dt_add_property(dt_fw, "mi-version", buffer, strlen(buffer)); + + /* ML version */ + offset = snprintf(buffer, FW_VER_SIZE, "ML %s %s", + fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id, + fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id); + if (ipl_side == FW_IPL_SIDE_TEMP) + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_TEMP].ext_fw_id); + else + snprintf(buffer + offset, FW_VER_SIZE - offset, + " %s", fw_vpd[FW_IPL_SIDE_PERM].ext_fw_id); + + dt_add_property(dt_fw, "ml-version", buffer, strlen(buffer)); +} + +/* + * This is called right before starting the payload (Linux) to + * ensure the common marker LID read and parsing has happened + * before we transfer control. 
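+ *
+ * It also waits for any in-flight system parameter queries (update
+ * policy, HMC-managed flag) to complete and, on the boot path, adds
+ * the "mi-version"/"ml-version" properties to the device tree once
+ * the marker LID has been parsed.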
+ */ +void fsp_code_update_wait_vpd(bool is_boot) +{ + if (!fsp_present()) + return; + + printf("CUPD: Waiting read marker LID completion...\n"); + + while(flash_state == FLASH_STATE_READING) + fsp_poll(); + + printf("CUPD: Waiting in flight params completion...\n"); + while(in_flight_params) + fsp_poll(); + + if (is_boot) + add_opal_firmware_version(); +} + +static int code_update_start(void) +{ + struct fsp_msg *msg; + int rc; + uint16_t comp = 0x00; /* All components */ + uint8_t side = OPAL_COMMIT_TMP_SIDE; /* Temporary side */ + + msg = fsp_mkmsg(FSP_CMD_FLASH_START, 1, side << 16 | comp); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_START message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_write_lid(uint32_t lid_id, uint32_t size) +{ + struct fsp_msg *msg; + int rc, n_pairs = 1; + + msg = fsp_mkmsg(FSP_CMD_FLASH_WRITE, 5, lid_id, + n_pairs, 0, tce_start, size); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_WRITE message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_del_lid(uint32_t lid_id) +{ + struct fsp_msg *msg; + int rc; + + msg = fsp_mkmsg(FSP_CMD_FLASH_DEL, 1, lid_id); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_DEL message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_complete(uint32_t cmd) +{ + struct fsp_msg *msg; + int rc; + + msg = fsp_mkmsg(cmd, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CUPD COMPLETE message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_swap_side(void) +{ + struct fsp_msg *msg; + int rc; + + msg = fsp_mkmsg(FSP_CMD_FLASH_SWAP, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_FLASH_SWAP message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static int code_update_set_ipl_side(void) +{ + struct fsp_msg *msg; + uint8_t side = FW_IPL_SIDE_TEMP; /* Next IPL side */ + int rc; + + msg = fsp_mkmsg(FSP_CMD_SET_IPL_SIDE, 1, side << 16); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: CMD_SET_IPL_SIDE message allocation failed!\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_sync_msg(msg, false)) { + fsp_freemsg(msg); + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: Setting next IPL side failed!\n"); + return OPAL_INTERNAL_ERROR; + } + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(msg); + return rc; +} + +static void code_update_commit_complete(struct fsp_msg *msg) +{ + int rc; + uint8_t type; + + rc = (msg->resp->word1 >> 8) & 0xff; + type = (msg->word1 >> 8) & 0xff; + fsp_freemsg(msg); + if (rc) { + 
log_simple_error(&e_info(OPAL_RC_CU_COMMIT), + "CUPD: Code update commit failed, err 0x%x\n", rc); + return; + } + + /* Reset cached VPD data */ + lock(&flash_lock); + + /* Find commit type */ + if (type == 0x01) { + lid_fetch_side = FETCH_P_SIDE_ONLY; + } else if (type == 0x02) + lid_fetch_side = FETCH_T_SIDE_ONLY; + else + lid_fetch_side = FETCH_BOTH_SIDE; + + fetch_com_marker_lid(); + + unlock(&flash_lock); +} + +static int code_update_commit(uint32_t cmd) +{ + struct fsp_msg *msg; + + msg = fsp_mkmsg(cmd, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_CU_MSG), + "CUPD: COMMIT message allocation failed !\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_queue_msg(msg, code_update_commit_complete)) { + log_simple_error(&e_info(OPAL_RC_CU_COMMIT), + "CUPD: Failed to queue code update commit message\n"); + fsp_freemsg(msg); + return OPAL_INTERNAL_ERROR; + } + return OPAL_SUCCESS; +} + +/* + * Inband code update is allowed? + */ +static int64_t validate_inband_policy(void) +{ + /* Quirk: + * If the code update policy is out-of-band, but the system + * is not HMC-managed, then inband update is allowed. + */ + if (hmc_managed != PLATFORM_HMC_MANAGED) + return 0; + if (update_policy == INBAND_UPDATE_ALLOWED) + return 0; + + return -1; +} + +/* + * Validate magic Number + */ +static int64_t validate_magic_num(uint16_t magic) +{ + if (magic != IMAGE_MAGIC_NUMBER) + return -1; + return 0; +} + +/* + * Compare MI keyword to make sure candidate image + * is valid for this platform. + */ +static int64_t validate_image_version(struct update_image_header *header, + uint32_t *result) +{ + struct fw_image_vpd vpd; + int t_valid = 0, p_valid = 0, cton_ver = -1, ptot_ver = -1; + + /* Valid flash image level? */ + if (strncmp(fw_vpd[0].MI_keyword, FW_VERSION_UNKNOWN, + sizeof(FW_VERSION_UNKNOWN)) != 0) + p_valid = 1; + + if (strncmp(fw_vpd[1].MI_keyword, FW_VERSION_UNKNOWN, + sizeof(FW_VERSION_UNKNOWN)) != 0) + t_valid = 1; + + /* Validate with IPL side image */ + vpd = fw_vpd[ipl_side]; + + /* Validate platform identifier (first two char of MI keyword) */ + if (strncmp(vpd.MI_keyword, header->MI_keyword_data, 2) != 0) { + *result = VALIDATE_INVALID_IMG; + return OPAL_SUCCESS; + } + + /* Don't flash different FW series (like P7 image on P8) */ + if (vpd.MI_keyword[2] != header->MI_keyword_data[2]) { + *result = VALIDATE_INVALID_IMG; + return OPAL_SUCCESS; + } + + /* Get current to new version difference */ + cton_ver = strncmp(vpd.MI_keyword + 3, header->MI_keyword_data + 3, 6); + + /* Get P to T version difference */ + if (t_valid && p_valid) + ptot_ver = strncmp(fw_vpd[0].MI_keyword + 3, + fw_vpd[1].MI_keyword + 3, 6); + + /* Update validation result */ + if (ipl_side == FW_IPL_SIDE_TEMP) { + if (!ptot_ver && cton_ver > 0) /* downgrade T side */ + *result = VALIDATE_TMP_UPDATE_DL; + else if (!ptot_ver && cton_ver <= 0) /* upgrade T side */ + *result = VALIDATE_TMP_UPDATE; + else if (cton_ver > 0) /* Implied commit & downgrade T side */ + *result = VALIDATE_TMP_COMMIT_DL; + else /* Implied commit & upgrade T side */ + *result = VALIDATE_TMP_COMMIT; + } else { + if (!t_valid) /* Current unknown */ + *result = VALIDATE_CUR_UNKNOWN; + else if (cton_ver > 0) /* downgrade FW version */ + *result = VALIDATE_TMP_UPDATE_DL; + else /* upgrade FW version */ + *result = VALIDATE_TMP_UPDATE; + } + return OPAL_SUCCESS; +} + +/* + * Validate candidate image + */ +static int validate_candidate_image(uint64_t buffer, + uint32_t size, uint32_t *result) +{ + struct update_image_header *header; + int rc = 
OPAL_PARAMETER; + + if (size < VALIDATE_BUF_SIZE) + goto out; + + rc = code_update_check_state(); + if (rc != OPAL_SUCCESS) + goto out; + + if (validate_inband_policy() != 0) { + *result = VALIDATE_FLASH_AUTH; + rc = OPAL_SUCCESS; + goto out; + } + + memcpy(validate_buf, (void *)buffer, VALIDATE_BUF_SIZE); + header = (struct update_image_header *)validate_buf; + + if (validate_magic_num(be32_to_cpu(header->magic)) != 0) { + *result = VALIDATE_INVALID_IMG; + rc = OPAL_SUCCESS; + goto out; + } + rc = validate_image_version(header, result); +out: + return rc; +} + +static int validate_out_buf_mi_data(void *buffer, int offset, uint32_t result) +{ + struct update_image_header *header = (void *)validate_buf; + + /* Current T & P side MI data */ + offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "MI %s %s\n", + fw_vpd[1].MI_keyword, fw_vpd[0].MI_keyword); + + /* New T & P side MI data */ + offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "MI %s", header->MI_keyword_data); + if (result == VALIDATE_TMP_COMMIT_DL || + result == VALIDATE_TMP_COMMIT) + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[1].MI_keyword); + else + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[0].MI_keyword); + return offset; +} + +static int validate_out_buf_ml_data(void *buffer, int offset, uint32_t result) +{ + struct update_image_header *header = (void *)validate_buf; + /* Candidate image ML data */ + char *ext_fw_id = (void *)header->data; + + /* Current T & P side ML data */ + offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "ML %s %s\n", + fw_vpd[1].ext_fw_id, fw_vpd[0].ext_fw_id); + + /* New T & P side ML data */ + offset += snprintf(buffer + offset, VALIDATE_BUF_SIZE - offset, + "ML %s", ext_fw_id); + if (result == VALIDATE_TMP_COMMIT_DL || + result == VALIDATE_TMP_COMMIT) + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[1].ext_fw_id); + else + offset += snprintf(buffer + offset, + VALIDATE_BUF_SIZE - offset, + " %s\n", fw_vpd[0].ext_fw_id); + + return offset; +} + +/* + * Copy LID data to TCE buffer + */ +static int get_lid_data(struct opal_sg_list *list, + int lid_size, int lid_offset) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *entry; + int length, num_entries, i, buf_pos = 0; + int map_act, map_size; + bool last = false; + + /* Reset TCE start address */ + tce_start = 0; + + for (sg = list; sg; sg = sg->next) { + length = (sg->length & ~(SG_LIST_VERSION << 56)) - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return -1; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + + /* + * Continue until we get data block which + * contains LID data + */ + if (lid_offset > entry->length) { + lid_offset -= entry->length; + continue; + } + + /* + * SG list entry size can be more than 4k. + * Map only required pages, instead of + * mapping entire entry. + */ + map_act = entry->length; + map_size = entry->length; + + /* First TCE mapping */ + if (!tce_start) { + tce_start = PSI_DMA_CODE_UPD + + (lid_offset & 0xfff); + map_act = entry->length - lid_offset; + lid_offset &= ~0xfff; + map_size = entry->length - lid_offset; + } + + /* Check pending LID size to map */ + if (lid_size <= map_act) { + /* (map_size - map_act) gives page + * start to tce offset difference. + * This is required when LID size + * is <= 4k. 
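+ *
+ * Worked example (numbers purely illustrative):
+ * entry->length = 0x10000, lid_offset = 0x1234, lid_size = 0x800.
+ * The first-mapping code above sets tce_start to
+ * PSI_DMA_CODE_UPD + 0x234, map_act = 0xedcc and, after rounding
+ * lid_offset down to 0x1000, map_size = 0xf000. Since
+ * lid_size <= map_act, only (0xf000 - 0xedcc) + 0x800 = 0xa34
+ * bytes are mapped from entry->data + 0x1000, i.e. the start of
+ * the page containing the LID plus the 0x800 bytes of payload.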
+ */ + map_size = (map_size - map_act) + lid_size; + last = true; + } + + /* Ajust remaining size to map */ + lid_size -= map_act; + + /* TCE mapping */ + code_update_tce_map(buf_pos, entry->data + lid_offset, + map_size); + buf_pos += map_size; + /* Reset LID offset count */ + lid_offset = 0; + + if (last) + return OPAL_SUCCESS; + } + } /* outer loop */ + return -1; +} + +/* + * If IPL side is T, then swap P & T sides to add + * new fix to T side. + */ +static int validate_ipl_side(void) +{ + if (ipl_side == FW_IPL_SIDE_PERM) + return 0; + return code_update_swap_side(); +} + +static int64_t fsp_opal_validate_flash(uint64_t buffer, + uint32_t *size, uint32_t *result) +{ + int64_t rc = 0; + int offset; + + lock(&flash_lock); + + rc = validate_candidate_image(buffer, *size, result); + /* Fill output buffer + * + * Format: + * MIcurrent-T-imagecurrent-P-image<0x0A> + * MInew-T-imagenew-P-image<0x0A> + * MLcurrent-T-imagecurrent-P-image<0x0A> + * MLnew-T-imagenew-P-image<0x0A> + */ + if (!rc && (*result != VALIDATE_FLASH_AUTH && + *result != VALIDATE_INVALID_IMG)) { + /* Clear output buffer */ + memset((void *)buffer, 0, VALIDATE_BUF_SIZE); + + offset = validate_out_buf_mi_data((void *)buffer, 0, *result); + offset += validate_out_buf_ml_data((void *)buffer, + offset, *result); + *size = offset; + } + + unlock(&flash_lock); + return rc; +} + +/* Commit/Reject T side image */ +static int64_t fsp_opal_manage_flash(uint8_t op) +{ + uint32_t cmd; + int rc; + + lock(&flash_lock); + rc = code_update_check_state(); + unlock(&flash_lock); + + if (rc != OPAL_SUCCESS) + return rc; + + if (op != OPAL_REJECT_TMP_SIDE && op != OPAL_COMMIT_TMP_SIDE) + return OPAL_PARAMETER; + + if ((op == OPAL_COMMIT_TMP_SIDE && ipl_side == FW_IPL_SIDE_PERM) || + (op == OPAL_REJECT_TMP_SIDE && ipl_side == FW_IPL_SIDE_TEMP)) + return OPAL_ACTIVE_SIDE_ERR; + + if (op == OPAL_COMMIT_TMP_SIDE) + cmd = FSP_CMD_FLASH_NORMAL; + else + cmd = FSP_CMD_FLASH_REMOVE; + + return code_update_commit(cmd); +} + +static int fsp_flash_firmware(void) +{ + struct update_image_header *header; + struct lid_index_entry *idx_entry; + struct opal_sg_list *list; + struct opal_sg_entry *entry; + int rc, i; + + lock(&flash_lock); + + /* Make sure no outstanding LID read is in progress */ + rc = code_update_check_state(); + if (rc == OPAL_BUSY) + fsp_code_update_wait_vpd(false); + + /* Get LID Index */ + list = image_data; + if (!list) + goto out; + entry = &list->entry[0]; + header = (struct update_image_header *)entry->data; + idx_entry = (void *)header + be16_to_cpu(header->lid_index_offset); + + /* FIXME: + * At present we depend on FSP to validate CRC for + * individual LIDs. Calculate and validate individual + * LID CRC here. + */ + + if (validate_ipl_side() != 0) + goto out; + + /* Set next IPL side */ + if (code_update_set_ipl_side() != 0) + goto out; + + /* Start code update process */ + if (code_update_start() != 0) + goto out; + + /* + * Delete T side LIDs before writing. + * + * Note: + * - Applicable for FWv >= 760. + * - Current Code Update design is to ignore + * any delete lid failure, and continue with + * the update. 
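+ * - Hence the return value of code_update_del_lid() below is
+ * deliberately not checked before the write loop starts.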
+ */ + rc = code_update_del_lid(DEL_UPD_SIDE_LIDS); + + for (i = 0; i < be16_to_cpu(header->number_lids); i++) { + if (be32_to_cpu(idx_entry->size) > LID_MAX_SIZE) { + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: " + "LID size 0x%x is > max LID size \n", + be32_to_cpu(idx_entry->size)); + + goto abort_update; + } + + rc = get_lid_data(list, be32_to_cpu(idx_entry->size), + be32_to_cpu(idx_entry->offset)); + if (rc) + goto abort_update; + + rc = code_update_write_lid(be32_to_cpu(idx_entry->id), + be32_to_cpu(idx_entry->size)); + if (rc) + goto abort_update; + + /* Unmap TCE */ + code_update_tce_unmap(PSI_DMA_CODE_UPD_SIZE); + + /* Next LID index */ + idx_entry = (void *)idx_entry + sizeof(struct lid_index_entry); + } + + /* Code update completed */ + rc = code_update_complete(FSP_CMD_FLASH_COMPLETE); + + unlock(&flash_lock); + return rc; + +abort_update: + log_simple_error(&e_info(OPAL_RC_CU_FLASH), "CUPD: LID update failed " + "Aborting codeupdate! rc:%d", rc); + rc = code_update_complete(FSP_CMD_FLASH_ABORT); +out: + unlock(&flash_lock); + return -1; +} + +static int64_t validate_sglist(struct opal_sg_list *list) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *prev_entry, *entry; + int length, num_entries, i; + + prev_entry = NULL; + for (sg = list; sg; sg = sg->next) { + length = (sg->length & ~(SG_LIST_VERSION << 56)) - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return -1; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + + /* All entries must be aligned */ + if (((uint64_t)entry->data) & 0xfff) + return OPAL_PARAMETER; + + /* All non-terminal entries size must be aligned */ + if (prev_entry && (prev_entry->length & 0xfff)) + return OPAL_PARAMETER; + + prev_entry = entry; + } + } + return OPAL_SUCCESS; +} + +static int64_t fsp_opal_update_flash(struct opal_sg_list *list) +{ + struct opal_sg_entry *entry; + int length, num_entries, result = 0, rc = OPAL_PARAMETER; + + /* Ensure that the sg list honors our alignment requirements */ + rc = validate_sglist(list); + if (rc) { + log_simple_error(&e_info(OPAL_RC_CU_SG_LIST), + "CUPD: sglist fails alignment requirements\n"); + return rc; + } + + lock(&flash_lock); + if (!list) { /* Cancel update request */ + fsp_flash_term_hook = NULL; + image_data = NULL; + rc = OPAL_SUCCESS; + goto out; + } + length = (list->length & ~(SG_LIST_VERSION << 56)) - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + goto out; + + /* Validate image header */ + entry = &list->entry[0]; + rc = validate_candidate_image((uint64_t)entry->data, + VALIDATE_BUF_SIZE, &result); + if (!rc && (result != VALIDATE_FLASH_AUTH && + result != VALIDATE_INVALID_IMG)) { + image_data = list; + fsp_flash_term_hook = fsp_flash_firmware; + goto out; + } + + /* Adjust return code */ + if (result == VALIDATE_FLASH_AUTH) + rc = OPAL_FLASH_NO_AUTH; + else if (result == VALIDATE_INVALID_IMG) + rc = OPAL_INVALID_IMAGE; + +out: + unlock(&flash_lock); + return rc; +} + +/* + * Code Update notifications + * + * Note: At present we just ACK these notifications. + * Reset cached VPD data if we are going to support + * concurrent image maint in future. 
+ */ +static bool code_update_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg) +{ + int rc; + uint32_t cmd; + + switch(cmd_sub_mod) { + case FSP_CMD_FLASH_CACHE: + cmd = FSP_CMD_FLASH_CACHE_RSP; + printf("CUPD: Update LID cache event [data = 0x%x]\n", + msg->data.words[0]); + break; + case FSP_CMD_FLASH_OUTC: + case FSP_CMD_FLASH_OUTR: + case FSP_CMD_FLASH_OUTS: + cmd = FSP_CMD_FLASH_OUT_RSP; + printf("CUPD: Out of band commit notify [Type = 0x%x]\n", + (msg->word1 >> 8) & 0xff); + break; + default: + log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Unknown " + "notification [cmd = 0x%x]\n", cmd_sub_mod); + return false; + } + + rc = fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg); + if (rc) + log_simple_error(&e_info(OPAL_RC_CU_NOTIFY), "CUPD: Failed to " + "queue code update notification response :%d\n", rc); + + return true; +} + +static struct fsp_client fsp_get_notify = { + .message = code_update_notify, +}; + +void fsp_code_update_init(void) +{ + if (!fsp_present()) { + flash_state = FLASH_STATE_ABSENT; + return; + } + + /* OPAL interface */ + opal_register(OPAL_FLASH_VALIDATE, fsp_opal_validate_flash, 3); + opal_register(OPAL_FLASH_MANAGE, fsp_opal_manage_flash, 1); + opal_register(OPAL_FLASH_UPDATE, fsp_opal_update_flash, 1); + + /* register Code Update Class D3 */ + fsp_register_client(&fsp_get_notify, FSP_MCLASS_CODE_UPDATE); + + /* Flash hook */ + fsp_flash_term_hook = NULL; + + /* Fetch various code update related sys parameters */ + get_ipl_side(); + get_code_update_policy(); + get_platform_hmc_managed(); + + /* Fetch common marker LID */ + lid_data = memalign(TCE_PSIZE, MARKER_LID_SIZE); + if (!lid_data) { + log_simple_error(&e_info(OPAL_RC_CU_INIT), + "CUPD: Failed to allocate memory for marker LID\n"); + flash_state = FLASH_STATE_ABSENT; + return; + } + fetch_com_marker_lid(); +} diff --git a/hw/fsp/fsp-console.c b/hw/fsp/fsp-console.c new file mode 100644 index 00000000..725edcc2 --- /dev/null +++ b/hw/fsp/fsp-console.c @@ -0,0 +1,922 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Service Processor serial console handling code + */ +#include +#include +#include +#include +#include +#include +#include +#include + +struct fsp_serbuf_hdr { + u16 partition_id; + u8 session_id; + u8 hmc_id; + u16 data_offset; + u16 last_valid; + u16 ovf_count; + u16 next_in; + u8 flags; + u8 reserved; + u16 next_out; + u8 data[]; +}; +#define SER_BUF_DATA_SIZE (0x10000 - sizeof(struct fsp_serbuf_hdr)) + +struct fsp_serial { + bool available; + bool open; + bool has_part0; + bool has_part1; + bool log_port; + bool out_poke; + char loc_code[LOC_CODE_SIZE]; + u16 rsrc_id; + struct fsp_serbuf_hdr *in_buf; + struct fsp_serbuf_hdr *out_buf; + struct fsp_msg *poke_msg; +}; + +#define SER_BUFFER_SIZE 0x00040000UL +#define MAX_SERIAL 4 + +static struct fsp_serial fsp_serials[MAX_SERIAL]; +static bool got_intf_query; +static bool got_assoc_resp; +static bool got_deassoc_resp; +static struct lock fsp_con_lock = LOCK_UNLOCKED; +static void* ser_buffer = NULL; + +static void fsp_console_reinit(void) +{ + int i; + void *base; + + /* Initialize out data structure pointers & TCE maps */ + base = ser_buffer; + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *ser = &fsp_serials[i]; + + ser->in_buf = base; + ser->out_buf = base + SER_BUFFER_SIZE/2; + base += SER_BUFFER_SIZE; + } + fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer, + 4 * PSI_DMA_SER0_SIZE); + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + + if (fs->rsrc_id == 0xffff) + continue; + printf("FSP: Reassociating HVSI console %d\n", i); + got_assoc_resp = false; + fsp_sync_msg(fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2, + (fs->rsrc_id << 16) | 1, i), true); + /* XXX add timeout ? */ + while(!got_assoc_resp) + fsp_poll(); + } +} + +static void fsp_close_consoles(void) +{ + unsigned int i; + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + + if (!fs->available) + continue; + + if (fs->rsrc_id == 0xffff) /* Get clarity from benh */ + continue; + + lock(&fsp_con_lock); + if (fs->open) { + fs->open = false; + fs->out_poke = false; + if (fs->poke_msg->state != fsp_msg_unused) + fsp_cancelmsg(fs->poke_msg); + fsp_freemsg(fs->poke_msg); + fs->poke_msg = NULL; + } + unlock(&fsp_con_lock); + } + printf("FSPCON: Closed consoles on account of FSP reset/reload\n"); +} + +static void fsp_pokemsg_reclaim(struct fsp_msg *msg) +{ + struct fsp_serial *fs = msg->user_data; + + /* + * The poke_msg might have been "detached" from the console + * in vserial_close, so we need to check whether it's current + * before touching the state, otherwise, just free it + */ + lock(&fsp_con_lock); + if (fs->open && fs->poke_msg == msg) { + if (fs->out_poke) { + fs->out_poke = false; + fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim); + } else + fs->poke_msg->state = fsp_msg_unused; + } else + fsp_freemsg(msg); + unlock(&fsp_con_lock); +} + +/* Called with the fsp_con_lock held */ +static size_t fsp_write_vserial(struct fsp_serial *fs, const char *buf, + size_t len) +{ + struct fsp_serbuf_hdr *sb = fs->out_buf; + u16 old_nin = sb->next_in; + u16 space, chunk; + + if (!fs->open) + return 0; + + space = (sb->next_out + SER_BUF_DATA_SIZE - old_nin - 1) + % SER_BUF_DATA_SIZE; + if (space < len) + len = space; + if (!len) + return 0; + + chunk = SER_BUF_DATA_SIZE - old_nin; + if (chunk > len) + chunk = len; + memcpy(&sb->data[old_nin], buf, chunk); + if (chunk < len) + memcpy(&sb->data[0], buf + chunk, len - chunk); + lwsync(); + sb->next_in = (old_nin + len) % SER_BUF_DATA_SIZE; + sync(); + + if (sb->next_out == 
old_nin && fs->poke_msg) { + if (fs->poke_msg->state == fsp_msg_unused) + fsp_queue_msg(fs->poke_msg, fsp_pokemsg_reclaim); + else + fs->out_poke = true; + } +#ifndef DISABLE_CON_PENDING_EVT + opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT, + OPAL_EVENT_CONSOLE_OUTPUT); +#endif + return len; +} + +#ifdef DVS_CONSOLE +static int fsp_con_port = -1; +static bool fsp_con_full; + +/* + * This is called by the code in console.c without the con_lock + * held. However it can be called as the result of any printf + * thus any other lock might be held including possibly the + * FSP lock + */ +static size_t fsp_con_write(const char *buf, size_t len) +{ + size_t written; + + if (fsp_con_port < 0) + return 0; + + lock(&fsp_con_lock); + written = fsp_write_vserial(&fsp_serials[fsp_con_port], buf, len); + fsp_con_full = (written < len); + unlock(&fsp_con_lock); + + return written; +} + +static struct con_ops fsp_con_ops = { + .write = fsp_con_write, +}; +#endif /* DVS_CONSOLE */ + +static void fsp_open_vserial(struct fsp_msg *msg) +{ + u16 part_id = msg->data.words[0] & 0xffff; + u16 sess_id = msg->data.words[1] & 0xffff; + u8 hmc_sess = msg->data.bytes[0]; + u8 hmc_indx = msg->data.bytes[1]; + u8 authority = msg->data.bytes[4]; + u32 tce_in, tce_out; + struct fsp_serial *fs; + + printf("FSPCON: Got VSerial Open\n"); + printf(" part_id = 0x%04x\n", part_id); + printf(" sess_id = 0x%04x\n", sess_id); + printf(" hmc_sess = 0x%02x\n", hmc_sess); + printf(" hmc_indx = 0x%02x\n", hmc_indx); + printf(" authority = 0x%02x\n", authority); + + if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) { + fsp_queue_msg(fsp_mkmsg(FSP_RSP_OPEN_VSERIAL | 0x2f, 0), + fsp_freemsg); + printf(" NOT AVAILABLE !\n"); + return; + } + + fs = &fsp_serials[sess_id]; + + /* Hack ! On blades, the console opened via the mm has partition 1 + * while the debug DVS generally has partition 0 (though you can + * use what you want really). 
+ * We don't want a DVS open/close to crap on the blademm console + * thus if it's a raw console, gets an open with partID 1, we + * set a flag that ignores the close of partid 0 + */ + if (fs->rsrc_id == 0xffff) { + if (part_id == 0) + fs->has_part0 = true; + if (part_id == 1) + fs->has_part1 = true; + } + + tce_in = PSI_DMA_SER0_BASE + PSI_DMA_SER0_SIZE * sess_id; + tce_out = tce_in + SER_BUFFER_SIZE/2; + + lock(&fsp_con_lock); + if (fs->open) { + printf(" already open, skipping init !\n"); + unlock(&fsp_con_lock); + goto already_open; + } + + fs->open = true; + + fs->poke_msg = fsp_mkmsg(FSP_CMD_VSERIAL_OUT, 2, + msg->data.words[0], + msg->data.words[1] & 0xffff); + fs->poke_msg->user_data = fs; + + fs->in_buf->partition_id = fs->out_buf->partition_id = part_id; + fs->in_buf->session_id = fs->out_buf->session_id = sess_id; + fs->in_buf->hmc_id = fs->out_buf->hmc_id = hmc_indx; + fs->in_buf->data_offset = fs->out_buf->data_offset = + sizeof(struct fsp_serbuf_hdr); + fs->in_buf->last_valid = fs->out_buf->last_valid = + SER_BUF_DATA_SIZE - 1; + fs->in_buf->ovf_count = fs->out_buf->ovf_count = 0; + fs->in_buf->next_in = fs->out_buf->next_in = 0; + fs->in_buf->flags = fs->out_buf->flags = 0; + fs->in_buf->reserved = fs->out_buf->reserved = 0; + fs->in_buf->next_out = fs->out_buf->next_out = 0; + unlock(&fsp_con_lock); + + already_open: + fsp_queue_msg(fsp_mkmsg(FSP_RSP_OPEN_VSERIAL, 6, + msg->data.words[0], + msg->data.words[1] & 0xffff, + 0, tce_in, 0, tce_out), fsp_freemsg); + +#ifdef DVS_CONSOLE + printf(" log_port = %d\n", fs->log_port); + if (fs->log_port) { + fsp_con_port = sess_id; + sync(); + /* + * We mark the FSP lock as being in the console + * path. We do that only once, we never unmark it + * (there is really no much point) + */ + fsp_used_by_console(); + fsp_con_lock.in_con_path = true; + set_console(&fsp_con_ops); + } +#endif +} + +static void fsp_close_vserial(struct fsp_msg *msg) +{ + u16 part_id = msg->data.words[0] & 0xffff; + u16 sess_id = msg->data.words[1] & 0xffff; + u8 hmc_sess = msg->data.bytes[0]; + u8 hmc_indx = msg->data.bytes[1]; + u8 authority = msg->data.bytes[4]; + struct fsp_serial *fs; + + printf("FSPCON: Got VSerial Close\n"); + printf(" part_id = 0x%04x\n", part_id); + printf(" sess_id = 0x%04x\n", sess_id); + printf(" hmc_sess = 0x%02x\n", hmc_sess); + printf(" hmc_indx = 0x%02x\n", hmc_indx); + printf(" authority = 0x%02x\n", authority); + + if (sess_id >= MAX_SERIAL || !fsp_serials[sess_id].available) { + printf(" NOT AVAILABLE !\n"); + goto skip_close; + } + + fs = &fsp_serials[sess_id]; + + /* See "HACK" comment in open */ + if (fs->rsrc_id == 0xffff) { + if (part_id == 0) + fs->has_part0 = false; + if (part_id == 1) + fs->has_part1 = false; + if (fs->has_part0 || fs->has_part1) { + printf(" skipping close !\n"); + goto skip_close; + } + } + +#ifdef DVS_CONSOLE + if (fs->log_port) { + fsp_con_port = -1; + set_console(NULL); + } +#endif + + lock(&fsp_con_lock); + if (fs->open) { + fs->open = false; + fs->out_poke = false; + if (fs->poke_msg && fs->poke_msg->state == fsp_msg_unused) { + fsp_freemsg(fs->poke_msg); + fs->poke_msg = NULL; + } + } + unlock(&fsp_con_lock); + skip_close: + fsp_queue_msg(fsp_mkmsg(FSP_RSP_CLOSE_VSERIAL, 2, + msg->data.words[0], + msg->data.words[1] & 0xffff), + fsp_freemsg); +} + +static bool fsp_con_msg_hmc(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + /* Associate response */ + if ((cmd_sub_mod >> 8) == 0xe08a) { + printf("FSPCON: Got associate response, status 0x%02x\n", + cmd_sub_mod & 0xff); + got_assoc_resp = true; + return 
true; + } + if ((cmd_sub_mod >> 8) == 0xe08b) { + printf("Got unassociate response, status 0x%02x\n", + cmd_sub_mod & 0xff); + got_deassoc_resp = true; + return true; + } + switch(cmd_sub_mod) { + case FSP_CMD_OPEN_VSERIAL: + fsp_open_vserial(msg); + return true; + case FSP_CMD_CLOSE_VSERIAL: + fsp_close_vserial(msg); + return true; + case FSP_CMD_HMC_INTF_QUERY: + printf("FSPCON: Got HMC interface query\n"); + + /* Keep that synchronous due to FSP fragile ordering + * of the boot sequence + */ + fsp_sync_msg(fsp_mkmsg(FSP_RSP_HMC_INTF_QUERY, 1, + msg->data.words[0] & 0x00ffffff), true); + got_intf_query = true; + return true; + } + return false; +} + +static bool fsp_con_msg_vt(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u16 sess_id = msg->data.words[1] & 0xffff; + + if (cmd_sub_mod == FSP_CMD_VSERIAL_IN && sess_id < MAX_SERIAL) { + struct fsp_serial *fs = &fsp_serials[sess_id]; + + if (!fs->open) + return true; + + /* FSP is signaling some incoming data. We take the console + * lock to avoid racing with a simultaneous read, though we + * might want to consider to simplify all that locking into + * one single lock that covers the console and the pending + * events. + */ + lock(&fsp_con_lock); + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, + OPAL_EVENT_CONSOLE_INPUT); + unlock(&fsp_con_lock); + } + return true; +} + +static bool fsp_con_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + fsp_close_consoles(); + return true; + case FSP_RELOAD_COMPLETE: + fsp_console_reinit(); + return true; + } + return false; +} + +static struct fsp_client fsp_con_client_hmc = { + .message = fsp_con_msg_hmc, +}; + +static struct fsp_client fsp_con_client_vt = { + .message = fsp_con_msg_vt, +}; + +static struct fsp_client fsp_con_client_rr = { + .message = fsp_con_msg_rr, +}; + +static void fsp_serial_add(int index, u16 rsrc_id, const char *loc_code, + bool log_port) +{ + struct fsp_serial *ser; + + lock(&fsp_con_lock); + ser = &fsp_serials[index]; + + if (ser->available) { + unlock(&fsp_con_lock); + return; + } + + ser->rsrc_id = rsrc_id; + strncpy(ser->loc_code, loc_code, LOC_CODE_SIZE); + ser->available = true; + ser->log_port = log_port; + unlock(&fsp_con_lock); + + /* DVS doesn't have that */ + if (rsrc_id != 0xffff) { + got_assoc_resp = false; + fsp_sync_msg(fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2, + (rsrc_id << 16) | 1, index), true); + /* XXX add timeout ? */ + while(!got_assoc_resp) + fsp_poll(); + } +} + +void fsp_console_preinit(void) +{ + int i; + void *base; + + if (!fsp_present()) + return; + + ser_buffer = memalign(TCE_PSIZE, SER_BUFFER_SIZE * MAX_SERIAL); + + /* Initialize out data structure pointers & TCE maps */ + base = ser_buffer; + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *ser = &fsp_serials[i]; + + ser->in_buf = base; + ser->out_buf = base + SER_BUFFER_SIZE/2; + base += SER_BUFFER_SIZE; + } + fsp_tce_map(PSI_DMA_SER0_BASE, ser_buffer, + 4 * PSI_DMA_SER0_SIZE); + + /* Register for class E0 and E1 */ + fsp_register_client(&fsp_con_client_hmc, FSP_MCLASS_HMC_INTFMSG); + fsp_register_client(&fsp_con_client_vt, FSP_MCLASS_HMC_VT); + fsp_register_client(&fsp_con_client_rr, FSP_MCLASS_RR_EVENT); + + /* Add DVS ports. We currently have session 0 and 3, 0 is for + * OS use. 3 is our debug port. We need to add those before + * we complete the OPL or we'll potentially miss the + * console setup on Firebird blades. 
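+ *
+ * rsrc_id 0xffff marks these as DVS (raw) ports, so fsp_serial_add()
+ * skips the HVSI associate handshake for them; session 3 is flagged
+ * as the log port so the firmware's own console output can be
+ * directed to it once the session is opened.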
+ */ + fsp_serial_add(0, 0xffff, "DVS_OS", false); + op_display(OP_LOG, OP_MOD_FSPCON, 0x0001); + fsp_serial_add(3, 0xffff, "DVS_FW", true); + op_display(OP_LOG, OP_MOD_FSPCON, 0x0002); + +} + +static int64_t fsp_console_write(int64_t term_number, int64_t *length, + const uint8_t *buffer) +{ + struct fsp_serial *fs; + size_t written, requested; + + if (term_number < 0 || term_number >= MAX_SERIAL) + return OPAL_PARAMETER; + fs = &fsp_serials[term_number]; + if (!fs->available || fs->log_port) + return OPAL_PARAMETER; + lock(&fsp_con_lock); + if (!fs->open) { + unlock(&fsp_con_lock); + return OPAL_CLOSED; + } + /* Clamp to a reasonable size */ + requested = *length; + if (requested > 0x1000) + requested = 0x1000; + written = fsp_write_vserial(fs, buffer, requested); + +#ifdef OPAL_DEBUG_CONSOLE_IO + printf("OPAL: console write req=%ld written=%ld ni=%d no=%d\n", + requested, written, fs->out_buf->next_in, fs->out_buf->next_out); + printf(" %02x %02x %02x %02x " + "%02x \'%c\' %02x \'%c\' %02x \'%c\'.%02x \'%c\'..\n", + buffer[0], buffer[1], buffer[2], buffer[3], + buffer[4], buffer[4], buffer[5], buffer[5], + buffer[6], buffer[6], buffer[7], buffer[7]); +#endif /* OPAL_DEBUG_CONSOLE_IO */ + + *length = written; + unlock(&fsp_con_lock); + + return written ? OPAL_SUCCESS : OPAL_BUSY_EVENT; +} + +static int64_t fsp_console_write_buffer_space(int64_t term_number, + int64_t *length) +{ + struct fsp_serial *fs; + struct fsp_serbuf_hdr *sb; + + if (term_number < 0 || term_number >= MAX_SERIAL) + return OPAL_PARAMETER; + fs = &fsp_serials[term_number]; + if (!fs->available || fs->log_port) + return OPAL_PARAMETER; + lock(&fsp_con_lock); + if (!fs->open) { + unlock(&fsp_con_lock); + return OPAL_CLOSED; + } + sb = fs->out_buf; + *length = (sb->next_out + SER_BUF_DATA_SIZE - sb->next_in - 1) + % SER_BUF_DATA_SIZE; + unlock(&fsp_con_lock); + + return OPAL_SUCCESS; +} + +static int64_t fsp_console_read(int64_t term_number, int64_t *length, + uint8_t *buffer __unused) +{ + struct fsp_serial *fs; + struct fsp_serbuf_hdr *sb; + bool pending = false; + uint32_t old_nin, n, i, chunk, req = *length; + + if (term_number < 0 || term_number >= MAX_SERIAL) + return OPAL_PARAMETER; + fs = &fsp_serials[term_number]; + if (!fs->available || fs->log_port) + return OPAL_PARAMETER; + lock(&fsp_con_lock); + if (!fs->open) { + unlock(&fsp_con_lock); + return OPAL_CLOSED; + } + sb = fs->in_buf; + old_nin = sb->next_in; + lwsync(); + n = (old_nin + SER_BUF_DATA_SIZE - sb->next_out) + % SER_BUF_DATA_SIZE; + if (n > req) { + pending = true; + n = req; + } + *length = n; + + chunk = SER_BUF_DATA_SIZE - sb->next_out; + if (chunk > n) + chunk = n; + memcpy(buffer, &sb->data[sb->next_out], chunk); + if (chunk < n) + memcpy(buffer + chunk, &sb->data[0], n - chunk); + sb->next_out = (sb->next_out + n) % SER_BUF_DATA_SIZE; + +#ifdef OPAL_DEBUG_CONSOLE_IO + printf("OPAL: console read req=%d read=%d ni=%d no=%d\n", + req, n, sb->next_in, sb->next_out); + printf(" %02x %02x %02x %02x %02x %02x %02x %02x ...\n", + buffer[0], buffer[1], buffer[2], buffer[3], + buffer[4], buffer[5], buffer[6], buffer[7]); +#endif /* OPAL_DEBUG_CONSOLE_IO */ + + /* Might clear the input pending flag */ + for (i = 0; i < MAX_SERIAL && !pending; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->in_buf; + + if (fs->log_port || !fs->open) + continue; + if (sb->next_out != sb->next_in) + pending = true; + } + if (!pending) + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0); + + unlock(&fsp_con_lock); + + return 
OPAL_SUCCESS; +} + +void fsp_console_poll(void *data __unused) +{ +#ifdef OPAL_DEBUG_CONSOLE_POLL + static int debug; +#endif + + /* + * We don't get messages for out buffer being consumed, so we + * need to poll. We also defer sending of poke messages from + * the sapphire console to avoid a locking nightmare with + * beging called from printf() deep into an existing lock nest + * stack. + */ + if (fsp_con_full || + (opal_pending_events & OPAL_EVENT_CONSOLE_OUTPUT)) { + unsigned int i; + bool pending = false; + + /* We take the console lock. This is somewhat inefficient + * but it guarantees we aren't racing with a write, and + * thus clearing an event improperly + */ + lock(&fsp_con_lock); + for (i = 0; i < MAX_SERIAL && !pending; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->out_buf; + + if (!fs->open) + continue; + if (sb->next_out == sb->next_in) + continue; + if (fs->log_port) + __flush_console(); + else { +#ifdef OPAL_DEBUG_CONSOLE_POLL + if (debug < 5) { + printf("OPAL: %d still pending" + " ni=%d no=%d\n", + i, sb->next_in, sb->next_out); + debug++; + } +#endif /* OPAL_DEBUG_CONSOLE_POLL */ + pending = true; + } + } + if (!pending) { + opal_update_pending_evt(OPAL_EVENT_CONSOLE_OUTPUT, 0); +#ifdef OPAL_DEBUG_CONSOLE_POLL + debug = 0; +#endif + } + unlock(&fsp_con_lock); + } +} + +void fsp_console_init(void) +{ + struct dt_node *serials, *ser; + int i; + + if (!fsp_present()) + return; + + opal_register(OPAL_CONSOLE_READ, fsp_console_read, 3); + opal_register(OPAL_CONSOLE_WRITE_BUFFER_SPACE, + fsp_console_write_buffer_space, 2); + opal_register(OPAL_CONSOLE_WRITE, fsp_console_write, 3); + + /* Wait until we got the intf query before moving on */ + while (!got_intf_query) + fsp_poll(); + + op_display(OP_LOG, OP_MOD_FSPCON, 0x0000); + + /* Register poller */ + opal_add_poller(fsp_console_poll, NULL); + + /* Parse serial port data */ + serials = dt_find_by_path(dt_root, "ipl-params/fsp-serial"); + if (!serials) { + prerror("FSPCON: No FSP serial ports in device-tree\n"); + return; + } + + i = 1; + dt_for_each_child(serials, ser) { + u32 rsrc_id = dt_prop_get_u32(ser, "reg"); + const void *lc = dt_prop_get(ser, "ibm,loc-code"); + + printf("FSPCON: Serial %d rsrc: %04x loc: %s\n", + i, rsrc_id, (const char *)lc); + fsp_serial_add(i++, rsrc_id, lc, false); + op_display(OP_LOG, OP_MOD_FSPCON, 0x0010 + i); + } + + op_display(OP_LOG, OP_MOD_FSPCON, 0x0005); +} + +static void flush_all_input(void) +{ + unsigned int i; + + lock(&fsp_con_lock); + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->in_buf; + + if (fs->log_port) + continue; + + sb->next_out = sb->next_in; + } + unlock(&fsp_con_lock); +} + +static bool send_all_hvsi_close(void) +{ + unsigned int i; + bool has_hvsi = false; + static const uint8_t close_packet[] = { 0xfe, 6, 0, 1, 0, 3 }; + + lock(&fsp_con_lock); + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct fsp_serbuf_hdr *sb = fs->out_buf; + unsigned int space, timeout = 10; + + if (fs->log_port) + continue; + if (fs->rsrc_id == 0xffff) + continue; + has_hvsi = true; + + /* Do we have room ? 
Wait a bit if not */ + while(timeout--) { + space = (sb->next_out + SER_BUF_DATA_SIZE - + sb->next_in - 1) % SER_BUF_DATA_SIZE; + if (space >= 6) + break; + time_wait_ms(500); + } + fsp_write_vserial(fs, close_packet, 6); + } + unlock(&fsp_con_lock); + + return has_hvsi; +} + +static void reopen_all_hvsi(void) +{ + unsigned int i; + + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + if (fs->rsrc_id == 0xffff) + continue; + printf("FSP: Deassociating HVSI console %d\n", i); + got_deassoc_resp = false; + fsp_sync_msg(fsp_mkmsg(FSP_CMD_UNASSOC_SERIAL, 1, + (i << 16) | 1), true); + /* XXX add timeout ? */ + while(!got_deassoc_resp) + fsp_poll(); + } + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + if (fs->rsrc_id == 0xffff) + continue; + printf("FSP: Reassociating HVSI console %d\n", i); + got_assoc_resp = false; + fsp_sync_msg(fsp_mkmsg(FSP_CMD_ASSOC_SERIAL, 2, + (fs->rsrc_id << 16) | 1, i), true); + /* XXX add timeout ? */ + while(!got_assoc_resp) + fsp_poll(); + } +} + +void fsp_console_reset(void) +{ + printf("FSP: Console reset !\n"); + + /* This is called on a fast-reset. To work around issues with HVSI + * initial negotiation, before we reboot the kernel, we flush all + * input and send an HVSI close packet. + */ + flush_all_input(); + + /* Returns false if there is no HVSI console */ + if (!send_all_hvsi_close()) + return; + + time_wait_ms(500); + + flush_all_input(); + + reopen_all_hvsi(); + +} + +void fsp_console_add_nodes(void) +{ + unsigned int i; + struct dt_node *consoles; + + consoles = dt_new(opal_node, "consoles"); + dt_add_property_cells(consoles, "#address-cells", 1); + dt_add_property_cells(consoles, "#size-cells", 0); + for (i = 0; i < MAX_SERIAL; i++) { + struct fsp_serial *fs = &fsp_serials[i]; + struct dt_node *fs_node; + char name[32]; + + if (fs->log_port || !fs->available) + continue; + + snprintf(name, sizeof(name), "serial@%d", i); + fs_node = dt_new(consoles, name); + if (fs->rsrc_id == 0xffff) + dt_add_property_string(fs_node, "compatible", + "ibm,opal-console-raw"); + else + dt_add_property_string(fs_node, "compatible", + "ibm,opal-console-hvsi"); + dt_add_property_cells(fs_node, + "#write-buffer-size", SER_BUF_DATA_SIZE); + dt_add_property_cells(fs_node, "reg", i); + dt_add_property_string(fs_node, "device_type", "serial"); + } +} + +void fsp_console_select_stdout(void) +{ + struct dt_node *iplp; + u32 ipl_mode = 0; + + if (!fsp_present()) + return; + + /* + * We hijack the "os-ipl-mode" setting in iplparams to select + * out output console. This is the "i5/OS partition mode boot" + * setting in ASMI converted to an integer: 0=A, 1=B, ... + */ + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) + ipl_mode = dt_prop_get_u32_def(iplp, "os-ipl-mode", 0); + + /* + * Now, if ipl_mode is 1 or 2, we set the corresponding serial + * port if it exists (ie, is opened) as the default console. 
+ * + * In any other case, we set the default console to serial0 + * which is DVS or IPMI + */ + if (ipl_mode == 1 && fsp_serials[1].open) { + dt_add_property_string(dt_chosen, "linux,stdout-path", + "/ibm,opal/consoles/serial@1"); + printf("FSPCON: default console 1\n"); + } else if (ipl_mode == 2 && fsp_serials[2].open) { + dt_add_property_string(dt_chosen, "linux,stdout-path", + "/ibm,opal/consoles/serial@2"); + printf("FSPCON: default console 2\n"); + } else { + dt_add_property_string(dt_chosen, "linux,stdout-path", + "/ibm,opal/consoles/serial@0"); + printf("FSPCON: default console 0\n"); + } +} + diff --git a/hw/fsp/fsp-diag.c b/hw/fsp/fsp-diag.c new file mode 100644 index 00000000..5f588af9 --- /dev/null +++ b/hw/fsp/fsp-diag.c @@ -0,0 +1,58 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Code for handling FSP_MCLASS_DIAG messages (cmd 0xee) + * Receiving a high level ack timeout is likely indicative of a firmware bug + */ +#include +#include +#include +#include +#include +#include +#include + +static bool fsp_diag_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + + if (cmd_sub_mod == FSP_RSP_DIAG_LINK_ERROR) { + printf("FIXME: Unhandled FSP_MCLASS_DIAG Link Error Report\n"); + return false; + } + + if (cmd_sub_mod != FSP_RSP_DIAG_ACK_TIMEOUT) { + printf("BUG: Unhandled subcommand: 0x%x (New FSP spec?)\n", + cmd_sub_mod); + return false; + } + + printf("BUG: High Level ACK timeout (FSP_MCLASS_DIAG) for 0x%x\n", + msg->data.words[0] & 0xffff0000); + + return true; +} + +static struct fsp_client fsp_diag = { + .message = fsp_diag_msg, +}; + +/* This is called at boot time */ +void fsp_init_diag(void) +{ + /* Register for the diag event */ + fsp_register_client(&fsp_diag, FSP_MCLASS_DIAG); +} diff --git a/hw/fsp/fsp-dump.c b/hw/fsp/fsp-dump.c new file mode 100644 index 00000000..be1aa7c2 --- /dev/null +++ b/hw/fsp/fsp-dump.c @@ -0,0 +1,917 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + * Dump support: + * We get dump notification from different sources: + * - During system intialization via HDAT + * - During FSP reset/reload (FipS dump) + * - Dump available notification MBOX command (0xCE, 0x78, 0x00) + * + * To avoid complications, we keep list of dumps in a list and fetch + * them serially. + * + * Dump retrieve process: + * - Once we get notification from FSP we enqueue the dump ID and notify + * Linux via OPAL event notification. 
+ * - Linux reads dump info and allocates required memory to fetch the dump + * and makes dump read call. + * - Sapphire fetches dump data from FSP. + * - Linux writes dump to disk and sends acknowledgement. + * - Sapphire acknowledges FSP. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Max outstanding dumps to retrieve + * + * Note: + * Dumps are serialized. We don't get notification for second + * dump of given type until we acknowledge first one. But we + * may get notification for different dump type. And our dump + * retrieval code is serialized. Hence we use list to keep + * track of outstanding dumps to be retrieved. + */ +#define MAX_DUMP_RECORD 0x04 + +/* Max retry */ +#define FIPS_DUMP_MAX_RETRY 0x03 + +/* Dump type */ +#define DUMP_TYPE_FSP 0x01 +#define DUMP_TYPE_SYS 0x02 +#define DUMP_TYPE_SMA 0x03 + +/* Dump fetch size */ +#define DUMP_FETCH_SIZE_FSP 0x500000 +#define DUMP_FETCH_SIZE_SYS 0x400000 +#define DUMP_FETCH_SIZE_RES 0x200000 + +/* Params for Fips dump */ +#define FSP_DUMP_TOOL_TYPE "SYS " +#define FSP_DUMP_CLIENT_ID "SAPPHIRE_CLIENT" + +enum dump_state { + DUMP_STATE_ABSENT, /* No FSP dump */ + DUMP_STATE_NONE, /* No dump to retrieve */ + DUMP_STATE_NOTIFY, /* Notified Linux */ + DUMP_STATE_FETCHING, /* Dump retrieval is in progress */ + DUMP_STATE_FETCH, /* Dump retrieve complete */ + DUMP_STATE_PARTIAL, /* Partial read */ + DUMP_STATE_ABORTING, /* Aborting due to kexec */ +}; + +/* Pending dump list */ +struct dump_record { + uint8_t type; + uint32_t id; + uint32_t size; + struct list_node link; +}; + +/* List definations */ +static LIST_HEAD(dump_pending); +static LIST_HEAD(dump_free); + +/* Dump retrieve state */ +static enum dump_state dump_state = DUMP_STATE_NONE; + +/* Dump buffer SG list */ +static struct opal_sg_list *dump_data; +static struct dump_record *dump_entry; +static int64_t dump_offset; +static size_t fetch_remain; + +/* FipS dump retry count */ +static int retry_cnt; + +/* Protect list and dump retrieve state */ +static struct lock dump_lock = LOCK_UNLOCKED; + +/* Forward declaration */ +static int64_t fsp_opal_dump_init(uint8_t dump_type); +static int64_t fsp_dump_read(void); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_LIST, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_ACK, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +/* + * Helper functions + */ +static inline void update_dump_state(enum dump_state state) +{ + dump_state = state; +} + +static int64_t check_dump_state(void) +{ + switch (dump_state) { + case DUMP_STATE_ABSENT: + return OPAL_HARDWARE; + case DUMP_STATE_NONE: + case DUMP_STATE_NOTIFY: + /* During dump fetch, notify is wrong state */ + return OPAL_WRONG_STATE; + case DUMP_STATE_FETCHING: + case DUMP_STATE_ABORTING: + return OPAL_BUSY_EVENT; + case DUMP_STATE_FETCH: + return OPAL_SUCCESS; + case DUMP_STATE_PARTIAL: + return OPAL_PARTIAL; + } + return OPAL_SUCCESS; +} + +static inline void dump_tce_map(uint32_t tce_offset, + void *buffer, uint32_t size) +{ + uint32_t tlen = ALIGN_UP(size, TCE_PSIZE); + fsp_tce_map(PSI_DMA_DUMP_DATA + tce_offset, buffer, tlen); +} + +static inline void dump_tce_unmap(uint32_t size) +{ + fsp_tce_unmap(PSI_DMA_DUMP_DATA, size); +} + +/* + * Returns Data set 
ID for the given dump type + */ +static inline uint16_t get_dump_data_set_id(uint8_t type) +{ + switch (type) { + case DUMP_TYPE_FSP: + return FSP_DATASET_SP_DUMP; + case DUMP_TYPE_SYS: + return FSP_DATASET_HW_DUMP; + default: + break; + } + return OPAL_INTERNAL_ERROR; +} + +/* + * Returns max data we can fetch from FSP fetch data call + */ +static inline int64_t get_dump_fetch_max_size(uint8_t type) +{ + switch (type) { + case DUMP_TYPE_FSP: + return DUMP_FETCH_SIZE_FSP; + case DUMP_TYPE_SYS: + return DUMP_FETCH_SIZE_SYS; + default: + break; + } + return OPAL_INTERNAL_ERROR; +} + +/* + * Get dump record from pending list + */ +static inline struct dump_record *get_dump_rec_from_list(uint32_t id) +{ + struct dump_record *record; + + list_for_each(&dump_pending, record, link) { + if (record->id == id) + return record; + } + return NULL; +} + +/* + * New dump available notification to Linux + */ +static void update_opal_dump_notify(void) +{ + /* + * Wait until current dump retrieval to complete + * before notifying again. + */ + if (dump_state != DUMP_STATE_NONE) + return; + + /* More dump's to retrieve */ + if (!list_empty(&dump_pending)) { + update_dump_state(DUMP_STATE_NOTIFY); + opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL, + OPAL_EVENT_DUMP_AVAIL); + } +} + +static int64_t remove_dump_id_from_list(uint32_t dump_id) +{ + struct dump_record *record, *nxt_record; + int rc = OPAL_SUCCESS; + bool found = false; + + /* Remove record from pending list */ + list_for_each_safe(&dump_pending, record, nxt_record, link) { + if (record->id != dump_id) + continue; + + found = true; + list_del(&record->link); + list_add(&dump_free, &record->link); + break; + } + + /* + * Continue update_opal_dump_notify even if it fails + * to remove ID. So that we can resend notification + * for the same dump ID to Linux. + */ + if (!found) { /* List corrupted? */ + log_simple_error(&e_info(OPAL_RC_DUMP_LIST), + "DUMP: ID 0x%x not found in list!\n", + dump_id); + rc = OPAL_PARAMETER; + } + + /* Update state */ + update_dump_state(DUMP_STATE_NONE); + /* Notify next available dump to retrieve */ + update_opal_dump_notify(); + + return rc; +} + +static int64_t add_dump_id_to_list(uint8_t dump_type, + uint32_t dump_id, uint32_t dump_size) +{ + struct dump_record *record; + int rc = OPAL_SUCCESS; + + lock(&dump_lock); + + rc = check_dump_state(); + if (rc == OPAL_HARDWARE) + goto out; + + /* List is full ? */ + if (list_empty(&dump_free)) { + printf("DUMP: Dump ID 0x%x is not queued.\n", dump_id); + rc = OPAL_RESOURCE; + goto out; + } + + /* Already queued? 
*/ + record = get_dump_rec_from_list(dump_id); + if (record) { + rc = OPAL_SUCCESS; + goto out; + } + + /* Add to list */ + record = list_pop(&dump_free, struct dump_record, link); + record->type = dump_type; + record->id = dump_id; + record->size = dump_size; + list_add_tail(&dump_pending, &record->link); + + /* OPAL notification */ + update_opal_dump_notify(); + rc = OPAL_SUCCESS; + +out: + unlock(&dump_lock); + return rc; +} + +static void dump_init_complete(struct fsp_msg *msg) +{ + uint8_t status = (msg->resp->word1 >> 8) & 0xff; + + printf("DUMP: FipS dump init status = 0x%x\n", status); + fsp_freemsg(msg); + + switch (status) { + case FSP_STATUS_SUCCESS: + printf("DUMP: Initiated FipS dump.\n"); + break; + case FSP_STATUS_BUSY: /* Retry, if FSP is busy */ + if (retry_cnt++ < FIPS_DUMP_MAX_RETRY) + if (fsp_opal_dump_init(DUMP_TYPE_FSP) == OPAL_SUCCESS) + return; + break; + default: + break; + } + /* Reset max retry count */ + retry_cnt = 0; +} + +/* + * Initiate new FipS dump + */ +static int64_t fsp_opal_dump_init(uint8_t dump_type) +{ + struct fsp_msg *msg; + int rc = OPAL_SUCCESS; + uint32_t *tool_type = (void *)FSP_DUMP_TOOL_TYPE; + uint32_t *client_id = (void *)FSP_DUMP_CLIENT_ID; + + /* Only FipS dump generate request is supported */ + if (dump_type != DUMP_TYPE_FSP) + return OPAL_PARAMETER; + + msg = fsp_mkmsg(FSP_CMD_FSP_DUMP_INIT, 6, *tool_type, + sizeof(FSP_DUMP_CLIENT_ID), *client_id, + *(client_id + 1), *(client_id + 2), *(client_id + 3)); + + if (!msg) { + log_simple_error(&e_info(OPAL_RC_DUMP_INIT), + "DUMP: Message allocation failed.\n"); + rc = OPAL_INTERNAL_ERROR; + } else if (fsp_queue_msg(msg, dump_init_complete)) { + log_simple_error(&e_info(OPAL_RC_DUMP_INIT), + "DUMP: Failed to queue FipS dump init request.\n"); + fsp_freemsg(msg); + rc = OPAL_INTERNAL_ERROR; + } + + return rc; +} + +/* + * OPAL interface to send dump information to Linux. + */ +static int64_t fsp_opal_dump_info2(uint32_t *dump_id, uint32_t *dump_size, + uint32_t *dump_type) +{ + struct dump_record *record; + int rc = OPAL_SUCCESS; + + lock(&dump_lock); + + /* Clear notification */ + opal_update_pending_evt(OPAL_EVENT_DUMP_AVAIL, 0); + + record = list_top(&dump_pending, struct dump_record, link); + if (!record) { /* List corrupted? 
*/ + update_dump_state(DUMP_STATE_NONE); + rc = OPAL_INTERNAL_ERROR; + goto out; + } + *dump_id = record->id; + *dump_size = record->size; + *dump_type = record->type; + +out: + unlock(&dump_lock); + return rc; +} + +static int64_t fsp_opal_dump_info(uint32_t *dump_id, uint32_t *dump_size) +{ + uint32_t dump_type; + return fsp_opal_dump_info2(dump_id, dump_size, &dump_type); +} + +static int64_t validate_dump_sglist(struct opal_sg_list *list, + int64_t *size) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *prev_entry, *entry; + int length, num_entries, i; + + prev_entry = NULL; + *size = 0; + for (sg = list; sg; sg = sg->next) { + length = sg->length - 16; + num_entries = length / sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return OPAL_PARAMETER; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + *size += entry->length; + + /* All entries must be aligned */ + if (((uint64_t)entry->data) & 0xfff) + return OPAL_PARAMETER; + + /* All non-terminal entries size must be aligned */ + if (prev_entry && (prev_entry->length & 0xfff)) + return OPAL_PARAMETER; + + prev_entry = entry; + } + } + return OPAL_SUCCESS; +} + +/* + * Map dump buffer to TCE buffer + */ +static int64_t map_dump_buffer(void) +{ + struct opal_sg_list *sg; + struct opal_sg_entry *entry; + int64_t fetch_max; + int length, num_entries, i; + int buf_off, fetch_off, tce_off, sg_off; + bool last = false; + + /* FSP fetch max size */ + fetch_max = get_dump_fetch_max_size(dump_entry->type); + if (fetch_max > (dump_entry->size - dump_offset)) + fetch_remain = dump_entry->size - dump_offset; + else + fetch_remain = fetch_max; + + /* offsets */ + fetch_off = fetch_remain; + tce_off = sg_off = 0; + + for (sg = dump_data; sg; sg = sg->next) { + num_entries = (sg->length - 16) / + sizeof(struct opal_sg_entry); + if (num_entries <= 0) + return OPAL_PARAMETER; + + for (i = 0; i < num_entries; i++) { + entry = &sg->entry[i]; + + /* Continue until we get offset */ + if ((sg_off + entry->length) < dump_offset) { + sg_off += entry->length; + continue; + } + + /* + * SG list entry size can be more than 4k. + * Map only required pages, instead of + * mapping entire entry. 
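+ * For example, if dump_offset falls 0x1800 bytes into an SG entry,
+ * buf_off is rounded down to the 4KB-aligned 0x1000 and only that
+ * entry's data from 0x1000 onwards is TCE-mapped.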
+ */ + if (!tce_off) { + buf_off = (dump_offset - sg_off) & ~0xfff; + length = entry->length - buf_off; + } else { + buf_off = 0; + length = entry->length; + } + + /* Adjust length for last mapping */ + if (fetch_off <= length) { + length = fetch_off; + last = true; + } + + /* Adjust offset */ + sg_off += entry->length; + fetch_off -= length; + + /* TCE mapping */ + dump_tce_map(tce_off, entry->data + buf_off, length); + tce_off += length; + + /* TCE mapping complete */ + if (last) + return OPAL_SUCCESS; + } + } /* outer loop */ + return OPAL_PARAMETER; +} + +static void dump_read_complete(struct fsp_msg *msg) +{ + void *buffer; + size_t length, offset; + int rc; + uint32_t dump_id; + uint16_t id; + uint8_t flags, status; + bool compl = false; + + status = (msg->resp->word1 >> 8) & 0xff; + flags = (msg->data.words[0] >> 16) & 0xff; + id = msg->data.words[0] & 0xffff; + dump_id = msg->data.words[1]; + offset = msg->resp->data.words[1]; + length = msg->resp->data.words[2]; + + fsp_freemsg(msg); + + lock(&dump_lock); + + if (dump_state == DUMP_STATE_ABORTING) { + printf("DUMP: Fetch dump aborted, ID = 0x%x\n", dump_id); + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + update_dump_state(DUMP_STATE_NONE); + goto bail; + } + + switch (status) { + case FSP_STATUS_SUCCESS: /* Fetch next dump block */ + if (dump_offset < dump_entry->size) { + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + rc = fsp_dump_read(); + if (rc == OPAL_SUCCESS) + goto bail; + } else { /* Dump read complete */ + compl = true; + } + break; + case FSP_STATUS_MORE_DATA: /* More data to read */ + offset += length; + buffer = (void *)PSI_DMA_DUMP_DATA + offset; + fetch_remain -= length; + + rc = fsp_fetch_data_queue(flags, id, dump_id, offset, buffer, + &fetch_remain, dump_read_complete); + if (rc == OPAL_SUCCESS) + goto bail; + break; + default: + break; + } + + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + + /* Update state */ + if (compl) { + printf("DUMP: Fetch dump success. ID = 0x%x\n", dump_id); + update_dump_state(DUMP_STATE_FETCH); + } else { + printf("DUMP: Fetch dump partial. ID = 0x%x\n", dump_id); + update_dump_state(DUMP_STATE_PARTIAL); + } + bail: + unlock(&dump_lock); +} + +/* + * Fetch dump data from FSP + */ +static int64_t fsp_dump_read(void) +{ + int64_t rc; + uint16_t data_set; + uint8_t flags = 0x00; + + /* Get data set ID */ + data_set = get_dump_data_set_id(dump_entry->type); + + /* Map TCE buffer */ + rc = map_dump_buffer(); + if (rc != OPAL_SUCCESS) { + printf("DUMP: TCE mapping failed\n"); + return rc; + } + + printf("DUMP: Fetch Dump. ID = %02x, sub ID = %08x, len = %ld\n", + data_set, dump_entry->id, fetch_remain); + + /* Fetch data */ + rc = fsp_fetch_data_queue(flags, data_set, dump_entry->id, + dump_offset, (void *)PSI_DMA_DUMP_DATA, + &fetch_remain, dump_read_complete); + + /* Adjust dump fetch offset */ + dump_offset += fetch_remain; + + return rc; +} + +static int64_t fsp_opal_dump_read(uint32_t dump_id, + struct opal_sg_list *list) +{ + struct dump_record *record; + int64_t rc, size; + + lock(&dump_lock); + + /* Check state */ + if (dump_state != DUMP_STATE_NOTIFY) { + rc = check_dump_state(); + goto out; + } + + /* Validate dump ID */ + record = get_dump_rec_from_list(dump_id); + if (!record) { /* List corrupted? 
*/ + rc = OPAL_INTERNAL_ERROR; + goto out; + } + + /* Validate dump buffer and size */ + rc = validate_dump_sglist(list, &size); + if (rc != OPAL_SUCCESS) { + printf("DUMP: SG list validation failed\n"); + goto out; + } + + if (size < record->size) { /* Insuffient buffer */ + printf("DUMP: Insufficient buffer\n"); + rc = OPAL_PARAMETER; + goto out; + } + + /* Update state */ + update_dump_state(DUMP_STATE_FETCHING); + + /* Fetch dump data */ + dump_entry = record; + dump_data = list; + dump_offset = 0; + rc = fsp_dump_read(); + if (rc != OPAL_SUCCESS) + goto out; + + /* Check status after initiating fetch data */ + rc = check_dump_state(); + +out: + unlock(&dump_lock); + return rc; +} + +static void dump_ack_complete(struct fsp_msg *msg) +{ + uint8_t status = (msg->resp->word1 >> 8) & 0xff; + + if (status) + log_simple_error(&e_info(OPAL_RC_DUMP_ACK), + "DUMP: ACK failed for ID: 0x%x\n", + msg->data.words[0]); + else + printf("DUMP: ACKed dump ID: 0x%x\n", msg->data.words[0]); + + fsp_freemsg(msg); +} + +/* + * Acknowledge dump + */ +static int64_t fsp_opal_dump_ack(uint32_t dump_id) +{ + struct dump_record *record; + struct fsp_msg *msg; + int rc; + uint32_t cmd; + uint8_t dump_type = 0; + + /* Get dump type */ + lock(&dump_lock); + record = get_dump_rec_from_list(dump_id); + if (record) + dump_type = record->type; + + /* + * Next available dump in pending list will be of different + * type. Hence we don't need to wait for ack complete. + * + * Note: + * This allows us to proceed even if we fail to ACK. + * In the worst case we may get notification for the + * same dump again, which is probably better than + * looping forever. + */ + rc = remove_dump_id_from_list(dump_id); + if (rc != OPAL_SUCCESS) /* Invalid dump id */ + goto out; + + /* Adjust mod value */ + cmd = FSP_CMD_ACK_DUMP | (dump_type & 0xff); + msg = fsp_mkmsg(cmd, 1, dump_id); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_DUMP_ACK), + "DUMP: Message allocation failed.!\n"); + rc = OPAL_INTERNAL_ERROR; + } else if (fsp_queue_msg(msg, dump_ack_complete)) { + log_simple_error(&e_info(OPAL_RC_DUMP_ACK), + "DUMP: Failed to queue dump ack message.\n"); + fsp_freemsg(msg); + rc = OPAL_INTERNAL_ERROR; + } +out: + unlock(&dump_lock); + return rc; +} + +/* Resend dump available notification */ +static int64_t fsp_opal_dump_resend_notification(void) +{ + lock(&dump_lock); + + if (dump_state != DUMP_STATE_ABSENT) + update_dump_state(DUMP_STATE_NONE); + + update_opal_dump_notify(); + + unlock(&dump_lock); + + return OPAL_SUCCESS; +} + +/* + * Handle FSP R/R event. + */ +static bool fsp_dump_retrieve_rr(uint32_t cmd_sub_mod, + struct fsp_msg *msg __unused) +{ + switch (cmd_sub_mod) { + case FSP_RESET_START: + lock(&dump_lock); + /* Reset dump state */ + if (dump_state == DUMP_STATE_FETCHING) + update_dump_state(DUMP_STATE_ABORTING); + unlock(&dump_lock); + return true; + case FSP_RELOAD_COMPLETE: + lock(&dump_lock); + + /* Reset TCE mapping */ + dump_tce_unmap(PSI_DMA_DUMP_DATA_SIZE); + + /* Reset dump state */ + update_dump_state(DUMP_STATE_NONE); + + /* + * For now keeping R/R handler simple. In the worst case + * we may endup resending dump available notification for + * same dump ID twice to Linux. + */ + update_opal_dump_notify(); + unlock(&dump_lock); + return true; + } + return false; +} + +/* + * Handle host kexec'ing scenarios + */ +static bool opal_kexec_dump_notify(void *data __unused) +{ + bool ready = true; + + lock(&dump_lock); + + /* Dump retrieve is in progress? 
*/ + if (dump_state == DUMP_STATE_FETCHING) + dump_state = DUMP_STATE_ABORTING; + + /* Not yet safe to kexec */ + if (dump_state == DUMP_STATE_ABORTING) + ready = false; + + unlock(&dump_lock); + + return ready; +} + +/* + * FipS dump notification + */ +void fsp_fips_dump_notify(uint32_t dump_id, uint32_t dump_size) +{ + printf("DUMP: FipS dump available. ID = 0x%x [size: %d bytes]\n", + dump_id, dump_size); + add_dump_id_to_list(DUMP_TYPE_FSP, dump_id, dump_size); +} + +/* + * System/Platform dump notification + */ +static bool fsp_sys_dump_notify(uint32_t cmd_sub_mod, struct fsp_msg *msg) +{ + /* + * Though spec says mod 00 is deprecated we still + * seems to get mod 00 notification (at least on + * P7 machine). + */ + if (cmd_sub_mod != FSP_RSP_SYS_DUMP && + cmd_sub_mod != FSP_RSP_SYS_DUMP_OLD) + return false; + + printf("DUMP: Platform dump available. ID = 0x%x [size: %d bytes]\n", + msg->data.words[0], msg->data.words[1]); + + add_dump_id_to_list(DUMP_TYPE_SYS, + msg->data.words[0], msg->data.words[1]); + return true; +} + +/* + * If platform dump available during IPL time, then we + * get notification via HDAT. Check for DT for the dump + * presence. + */ +static void check_ipl_sys_dump(void) +{ + struct dt_node *dump_node; + uint32_t dump_id, dump_size; + + dump_node = dt_find_by_path(dt_root, "ipl-params/platform-dump"); + if (!dump_node) + return; + + if (!dt_find_property(dump_node, "dump-id")) + return; + + dump_id = dt_prop_get_u32(dump_node, "dump-id"); + dump_size = (uint32_t)dt_prop_get_u64(dump_node, "total-size"); + + printf("DUMP: Platform dump present during IPL.\n"); + printf(" ID = 0x%x [size: %d bytes]\n", dump_id, dump_size); + + add_dump_id_to_list(DUMP_TYPE_SYS, dump_id, dump_size); +} + +/* + * Allocate and initialize dump list + */ +static int init_dump_free_list(void) +{ + struct dump_record *entry; + int i; + + entry = zalloc(sizeof(struct dump_record) * MAX_DUMP_RECORD); + if (!entry) { + log_simple_error(&e_info(OPAL_RC_DUMP_INIT), + "DUMP: Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < MAX_DUMP_RECORD; i++) { + list_add_tail(&dump_free, &entry->link); + entry++; + } + return 0; +} + +static struct fsp_client fsp_sys_dump_client = { + .message = fsp_sys_dump_notify, +}; + +static struct fsp_client fsp_dump_client_rr = { + .message = fsp_dump_retrieve_rr, +}; + +void fsp_dump_init(void) +{ + if (!fsp_present()) { + update_dump_state(DUMP_STATE_ABSENT); + return; + } + + /* Initialize list */ + if (init_dump_free_list() != 0) { + update_dump_state(DUMP_STATE_ABSENT); + return; + } + + /* Register for Class CE */ + fsp_register_client(&fsp_sys_dump_client, FSP_MCLASS_SERVICE); + /* Register for Class AA (FSP R/R) */ + fsp_register_client(&fsp_dump_client_rr, FSP_MCLASS_RR_EVENT); + + /* Register for sync on host reboot call */ + opal_add_host_sync_notifier(opal_kexec_dump_notify, NULL); + + /* OPAL interface */ + opal_register(OPAL_DUMP_INIT, fsp_opal_dump_init, 1); + opal_register(OPAL_DUMP_INFO, fsp_opal_dump_info, 2); + opal_register(OPAL_DUMP_INFO2, fsp_opal_dump_info2, 3); + opal_register(OPAL_DUMP_READ, fsp_opal_dump_read, 2); + opal_register(OPAL_DUMP_ACK, fsp_opal_dump_ack, 1); + opal_register(OPAL_DUMP_RESEND, fsp_opal_dump_resend_notification, 0); + + /* Check for platform dump presence during IPL time */ + check_ipl_sys_dump(); +} diff --git a/hw/fsp/fsp-elog-read.c b/hw/fsp/fsp-elog-read.c new file mode 100644 index 00000000..f4a689ff --- /dev/null +++ b/hw/fsp/fsp-elog-read.c @@ -0,0 +1,520 @@ +/* Copyright 2013-2014 IBM Corp. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + * This code will enable retrieving of error log from fsp->sapphire + * in sequence. + * Here, FSP would send next log only when sapphire sends a new + * log notification response to FSP. On Completion of reading + * the log from FSP, OPAL_EVENT_ERROR_LOG_AVAIL is signaled. + * This will remain raised until a call to opal_elog_read() + * is made and OPAL_SUCCESS is returned, upon which. + * the operation is complete and the event is cleared. + * This is READ action from FSP. + */ + +/* + * Design of READ error log : + * When we receive a new error log entry notificatiion from FSP, + * we queue it into the "pending" list. + * If the "pending" list is not empty, then we start the fetching log from FSP. + * + * When Linux reads a log entry, we dequeue it from the "pending" list + * and enqueue it to another "processed" list. At this point, if the + * "pending" list is not empty, we continue to fetch the next log. + * + * When Linux calls opal_resend_pending_logs(), we fetch the log + * corresponding to the head of the pending list and move it to the + * processed list, and continue this process this until the pending list is + * empty. If the pending list was empty earlier and is currently non-empty, we + * initiate an error log fetch. + * + * When Linux acks an error log, we remove it from processed list. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Maximum number of entries that are pre-allocated + * to keep track of pending elogs to be fetched. + */ +#define ELOG_READ_MAX_RECORD 128 + +/* Following variables are used to indicate state of the + * head log entry which is being fetched from FSP and + * these variables are not overwritten until next log is + * retrieved from FSP. + */ +enum elog_head_state { + ELOG_STATE_FETCHING, /*In the process of reading log from FSP. 
*/ + ELOG_STATE_FETCHED, /* Indicates reading log from FSP completed */ + ELOG_STATE_NONE, /* Indicates to fetch next log */ + ELOG_STATE_REJECTED, /* resend all pending logs to linux */ +}; + +/* structure to maintain log-id,log-size, pending and processed list */ +struct fsp_log_entry { + uint32_t log_id; + size_t log_size; + struct list_node link; +}; + +static LIST_HEAD(elog_read_pending); +static LIST_HEAD(elog_read_processed); +static LIST_HEAD(elog_read_free); + +/* + * lock is used to protect overwriting of processed and pending list + * and also used while updating state of each log + */ +static struct lock elog_read_lock = LOCK_UNLOCKED; + +/* log buffer to copy FSP log for READ */ +#define ELOG_READ_BUFFER_SIZE 0x00040000 +static void *elog_read_buffer = NULL; +static uint32_t elog_head_id; /* FSP entry ID */ +static size_t elog_head_size; /* actual FSP log size */ +static uint32_t elog_read_retries; /* bad response status count */ + +/* Initialize the state of the log */ +static enum elog_head_state elog_head_state = ELOG_STATE_NONE; + +/* Need forward declaration because of Circular dependency */ +static void fsp_elog_queue_fetch(void); + +/* + * check the response message for mbox acknowledgment + * command send to FSP. + */ +static void fsp_elog_ack_complete(struct fsp_msg *msg) +{ + uint8_t val; + + if (!msg->resp) + return; + val = (msg->resp->word1 >> 8) & 0xff; + if (val != 0) + prerror("ELOG: Acknowledgment error\n"); + fsp_freemsg(msg); +} + +/* send Error Log PHYP Acknowledgment to FSP with entry ID */ +static int64_t fsp_send_elog_ack(uint32_t log_id) +{ + + struct fsp_msg *ack_msg; + + ack_msg = fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK, 1, log_id); + if (!ack_msg) { + prerror("ELOG: Failed to allocate ack message\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_queue_msg(ack_msg, fsp_elog_ack_complete)) { + fsp_freemsg(ack_msg); + ack_msg = NULL; + prerror("ELOG: Error queueing elog ack complete\n"); + return OPAL_INTERNAL_ERROR; + } + return OPAL_SUCCESS; +} + +/* retrive error log from FSP with TCE for the data transfer */ +static void fsp_elog_check_and_fetch_head(void) +{ + lock(&elog_read_lock); + + if (elog_head_state != ELOG_STATE_NONE || + list_empty(&elog_read_pending)) { + unlock(&elog_read_lock); + return; + } + + elog_read_retries = 0; + + /* Start fetching first entry from the pending list */ + fsp_elog_queue_fetch(); + unlock(&elog_read_lock); +} + +/* this function should be called with the lock held */ +static void fsp_elog_set_head_state(enum elog_head_state state) +{ + enum elog_head_state old_state = elog_head_state; + + elog_head_state = state; + + if (state == ELOG_STATE_FETCHED && old_state != ELOG_STATE_FETCHED) + opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, + OPAL_EVENT_ERROR_LOG_AVAIL); + if (state != ELOG_STATE_FETCHED && old_state == ELOG_STATE_FETCHED) + opal_update_pending_evt(OPAL_EVENT_ERROR_LOG_AVAIL, 0); +} + +/* + * when we try maximum time of fetching log from fsp + * we call following function to delete log from the + * pending list and update the state to fetch next log + * + * this function should be called with the lock held + */ +static void fsp_elog_fetch_failure(uint8_t fsp_status) +{ + struct fsp_log_entry *log_data; + + /* read top list and delete the node */ + log_data = list_top(&elog_read_pending, struct fsp_log_entry, link); + list_del(&log_data->link); + list_add(&elog_read_free, &log_data->link); + prerror("ELOG: received invalid data: %x FSP status: 0x%x\n", + log_data->log_id, fsp_status); + 
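+ /* Back to NONE so the next pending log (if any) gets fetched */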
fsp_elog_set_head_state(ELOG_STATE_NONE); +} + +/* Read response value from FSP for fetch sp data mbox command */ +static void fsp_elog_read_complete(struct fsp_msg *read_msg) +{ + uint8_t val; + /*struct fsp_log_entry *log_data;*/ + + lock(&elog_read_lock); + val = (read_msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(read_msg); + + switch (val) { + case FSP_STATUS_SUCCESS: + fsp_elog_set_head_state(ELOG_STATE_FETCHED); + break; + + case FSP_STATUS_DMA_ERROR: + if (elog_read_retries++ < MAX_RETRIES) { + /* + * for a error response value from FSP, we try to + * send fetch sp data mbox command again for three + * times if response from FSP is still not valid + * we send generic error response to fsp. + */ + fsp_elog_queue_fetch(); + break; + } + fsp_elog_fetch_failure(val); + break; + + default: + fsp_elog_fetch_failure(val); + } + if (elog_head_state == ELOG_STATE_REJECTED) + fsp_elog_set_head_state(ELOG_STATE_NONE); + unlock(&elog_read_lock); + + /* Check if a new log needs fetching */ + fsp_elog_check_and_fetch_head(); +} + +/* read error log from FSP through mbox commands */ +static void fsp_elog_queue_fetch(void) +{ + int rc; + uint8_t flags = 0; + struct fsp_log_entry *entry; + + entry = list_top(&elog_read_pending, struct fsp_log_entry, link); + fsp_elog_set_head_state(ELOG_STATE_FETCHING); + elog_head_id = entry->log_id; + elog_head_size = entry->log_size; + + rc = fsp_fetch_data_queue(flags, FSP_DATASET_ERRLOG, elog_head_id, + 0, (void *)PSI_DMA_ERRLOG_READ_BUF, + &elog_head_size, fsp_elog_read_complete); + if (rc) { + prerror("ELOG: failed to queue read message: %d\n", rc); + fsp_elog_set_head_state(ELOG_STATE_NONE); + } +} + +/* opal interface for powernv to read log size and log ID from sapphire */ +static int64_t fsp_opal_elog_info(uint64_t *opla_elog_id, + uint64_t *opal_elog_size, uint64_t *elog_type) +{ + struct fsp_log_entry *log_data; + + /* copy type of the error log */ + *elog_type = ELOG_TYPE_PEL; + + lock(&elog_read_lock); + if (elog_head_state != ELOG_STATE_FETCHED) { + unlock(&elog_read_lock); + return OPAL_WRONG_STATE; + } + log_data = list_top(&elog_read_pending, struct fsp_log_entry, link); + *opla_elog_id = log_data->log_id; + *opal_elog_size = log_data->log_size; + unlock(&elog_read_lock); + return OPAL_SUCCESS; +} + +/* opal interface for powernv to read log from sapphire */ +static int64_t fsp_opal_elog_read(uint64_t *buffer, uint64_t opal_elog_size, + uint64_t opla_elog_id) +{ + struct fsp_log_entry *log_data; + + /* + * Read top entry from list. 
+ * as we know always top record of the list is fetched from FSP + */ + lock(&elog_read_lock); + if (elog_head_state != ELOG_STATE_FETCHED) { + unlock(&elog_read_lock); + return OPAL_WRONG_STATE; + } + + log_data = list_top(&elog_read_pending, struct fsp_log_entry, link); + + /* Check log ID and log size are same and then read log from buffer */ + if ((opla_elog_id != log_data->log_id) && + (opal_elog_size != log_data->log_size)) { + unlock(&elog_read_lock); + return OPAL_PARAMETER; + } + + memcpy((void *)buffer, elog_read_buffer, opal_elog_size); + + /* + * once log is read from linux move record from pending + * to processed list and delete record from pending list + * and change state of the log to fetch next record + */ + list_del(&log_data->link); + list_add(&elog_read_processed, &log_data->link); + fsp_elog_set_head_state(ELOG_STATE_NONE); + unlock(&elog_read_lock); + + + /* read error log from FSP */ + fsp_elog_check_and_fetch_head(); + + return OPAL_SUCCESS; +} + +/* set state of the log head before fetching the log */ +static void elog_reject_head(void) +{ + if (elog_head_state == ELOG_STATE_FETCHING) + fsp_elog_set_head_state(ELOG_STATE_REJECTED); + if (elog_head_state == ELOG_STATE_FETCHED) + fsp_elog_set_head_state(ELOG_STATE_NONE); +} + +/* opal Interface for powernv to send ack to fsp with log ID */ +static int64_t fsp_opal_elog_ack(uint64_t ack_id) +{ + int rc = 0; + struct fsp_log_entry *record, *next_record; + + /* Send acknowledgement to FSP */ + rc = fsp_send_elog_ack(ack_id); + if (rc != OPAL_SUCCESS) { + prerror("ELOG: failed to send acknowledgement: %d\n", rc); + return rc; + } + lock(&elog_read_lock); + if (ack_id == elog_head_id) + elog_reject_head(); + list_for_each_safe(&elog_read_pending, record, next_record, link) { + if (record->log_id != ack_id) + continue; + list_del(&record->link); + list_add(&elog_read_free, &record->link); + } + list_for_each_safe(&elog_read_processed, record, next_record, link) { + if (record->log_id != ack_id) + continue; + list_del(&record->link); + list_add(&elog_read_free, &record->link); + } + unlock(&elog_read_lock); + + return rc; +} + +/* + * once linux kexec's it ask to resend all logs which + * are not acknowledged from linux + */ +static void fsp_opal_resend_pending_logs(void) +{ + struct fsp_log_entry *entry; + + lock(&elog_read_lock); + + /* + * If processed list is not empty add all record from + * processed list to pending list at head of the list + * and delete records from processed list. 
+ */ + while (!list_empty(&elog_read_processed)) { + entry = list_pop(&elog_read_processed, + struct fsp_log_entry, link); + list_add(&elog_read_pending, &entry->link); + } + + /* + * If the current fetched or fetching log doesn't match our + * new pending list head, then reject it + */ + if (!list_empty(&elog_read_pending)) { + entry = list_top(&elog_read_pending, + struct fsp_log_entry, link); + if (entry->log_id != elog_head_id) + elog_reject_head(); + } + + unlock(&elog_read_lock); + + /* Read error log from FSP if needed */ + fsp_elog_check_and_fetch_head(); +} + +/* fsp elog notify function */ +static bool fsp_elog_msg(uint32_t cmd_sub_mod, struct fsp_msg *msg) +{ + int rc = 0; + struct fsp_log_entry *record; + uint32_t log_id; + uint32_t log_size; + + + if (cmd_sub_mod != FSP_CMD_ERRLOG_NOTIFICATION) + return false; + + log_id = msg->data.words[0]; + log_size = msg->data.words[1]; + + printf("ELOG: Notified of log 0x%08x (size: %d)\n", + log_id, log_size); + + /* take a lock until we take out the node from elog_read_free */ + lock(&elog_read_lock); + if (!list_empty(&elog_read_free)) { + /* Create a new entry in the pending list */ + record = list_pop(&elog_read_free, struct fsp_log_entry, link); + record->log_id = log_id; + record->log_size = log_size; + list_add_tail(&elog_read_pending, &record->link); + unlock(&elog_read_lock); + + /* Send response back to FSP for a new elog notify message */ + rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION, + 1, log_id), fsp_freemsg); + if (rc) + prerror("ELOG: Failed to queue errlog notification" + " response: %d\n", rc); + + /* read error log from FSP */ + fsp_elog_check_and_fetch_head(); + + } else { + printf("ELOG: Log entry 0x%08x discarded\n", log_id); + + /* unlock if elog_read_free is empty */ + unlock(&elog_read_lock); + + rc = fsp_queue_msg(fsp_mkmsg(FSP_RSP_ERRLOG_NOTIFICATION, + 1, log_id), fsp_freemsg); + if (rc) + prerror("ELOG: Failed to queue errlog notification" + " response: %d\n", rc); + /* + * if list is full with max record then we + * send discarded by phyp (condition full) ack to FSP. + * + * At some point in the future, we'll get notified again. + * This is largely up to FSP as to when they tell us about + * the log again. 
+ */ + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_ERRLOG_PHYP_ACK | 0x02, + 1, log_id), fsp_freemsg); + if (rc) + prerror("ELOG: Failed to queue errlog ack" + " response: %d\n", rc); + } + + return true; +} + +static struct fsp_client fsp_get_elog_notify = { + .message = fsp_elog_msg, +}; + +/* Pre-allocate memory for reading error log from FSP */ +static int init_elog_read_free_list(uint32_t num_entries) +{ + struct fsp_log_entry *entry; + int i; + + entry = zalloc(sizeof(struct fsp_log_entry) * num_entries); + if (!entry) + goto out_err; + + for (i = 0; i < num_entries; ++i) { + list_add_tail(&elog_read_free, &entry->link); + entry++; + } + return 0; + +out_err: + return -ENOMEM; +} + +/* fsp elog read init function */ +void fsp_elog_read_init(void) +{ + int val = 0; + + if (!fsp_present()) + return; + + elog_read_buffer = memalign(TCE_PSIZE, ELOG_READ_BUFFER_SIZE); + if (!elog_read_buffer) { + prerror("FSP: could not allocate FSP ELOG_READ_BUFFER!\n"); + return; + } + + /* Map TCEs */ + fsp_tce_map(PSI_DMA_ERRLOG_READ_BUF, elog_read_buffer, + PSI_DMA_ERRLOG_READ_BUF_SZ); + + /* pre allocate memory for 128 record */ + val = init_elog_read_free_list(ELOG_READ_MAX_RECORD); + if (val != 0) + return; + + /* register Eror log Class D2 */ + fsp_register_client(&fsp_get_elog_notify, FSP_MCLASS_ERR_LOG); + + /* register opal Interface */ + opal_register(OPAL_ELOG_READ, fsp_opal_elog_read, 3); + opal_register(OPAL_ELOG_ACK, fsp_opal_elog_ack, 1); + opal_register(OPAL_ELOG_RESEND, fsp_opal_resend_pending_logs, 0); + opal_register(OPAL_ELOG_SIZE, fsp_opal_elog_info, 3); +} diff --git a/hw/fsp/fsp-elog-write.c b/hw/fsp/fsp-elog-write.c new file mode 100644 index 00000000..ee79c4d9 --- /dev/null +++ b/hw/fsp/fsp-elog-write.c @@ -0,0 +1,643 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + * This code will enable generation and pushing of error log + * from powernv, sapphire to FSP + * Critical events from sapphire that needs to be reported + * will be pushed on to FSP after converting the + * error log to Platform Error Log (PEL) format. + * This is termed as WRITE action to FSP. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Maximum number buffers that are pre-allocated + * to hold elogs that are reported on Sapphire and + * powernv. 
+ */ +#define ELOG_WRITE_MAX_RECORD 64 + +static LIST_HEAD(elog_write_pending); +static LIST_HEAD(elog_write_free); + +static struct lock elog_write_lock = LOCK_UNLOCKED; +static struct lock elog_panic_write_lock = LOCK_UNLOCKED; + +/* Platform Log ID as per the spec */ +static uint32_t sapphire_elog_id = 0xB0000000; +static uint32_t powernv_elog_id = 0xB1000000; + +/* log buffer to copy FSP log for READ */ +#define ELOG_WRITE_BUFFER_SIZE 0x00050000 +static void *elog_write_buffer = NULL; + +#define ELOG_PANIC_WRITE_BUFFER_SIZE 0x0010000 +static void *elog_panic_write_buffer = NULL; + +struct opal_errorlog *panic_write_buffer; +static int panic_write_buffer_valid; +static uint32_t elog_write_retries; + +/* Need forward declaration because of Circular dependency */ +static int create_opal_event(struct opal_errorlog *elog_data, char *pel_buffer); +static int opal_send_elog_to_fsp(void); + +void log_error(struct opal_err_info *e_info, void *data, uint16_t size, + const char *fmt, ...) +{ + struct opal_errorlog *buf; + int tag = 0x44455343; /* ASCII of DESC */ + va_list list; + char err_msg[250]; + + va_start(list, fmt); + vsnprintf(err_msg, sizeof(err_msg), fmt, list); + va_end(list); + + /* Log the error on to Sapphire console */ + prerror("%s", err_msg); + + buf = opal_elog_create(e_info); + if (buf == NULL) + prerror("ELOG: Error getting buffer to log error\n"); + else { + opal_elog_update_user_dump(buf, err_msg, tag, strlen(err_msg)); + /* Append any number of call out dumps */ + if (e_info->call_out) + e_info->call_out(buf, data, size); + if (elog_fsp_commit(buf)) + prerror("ELOG: Re-try error logging\n"); + } +} + + +void log_simple_error(struct opal_err_info *e_info, const char *fmt, ...) +{ + struct opal_errorlog *buf; + int tag = 0x44455343; /* ASCII of DESC */ + va_list list; + char err_msg[250]; + + va_start(list, fmt); + vsnprintf(err_msg, sizeof(err_msg), fmt, list); + va_end(list); + + /* Log the error on to Sapphire console */ + prerror("%s", err_msg); + + buf = opal_elog_create(e_info); + if (buf == NULL) + prerror("ELOG: Error getting buffer to log error\n"); + else { + opal_elog_update_user_dump(buf, err_msg, tag, strlen(err_msg)); + if (elog_fsp_commit(buf)) + prerror("ELOG: Re-try error logging\n"); + } +} + +static struct opal_errorlog *get_write_buffer(int opal_event_severity) +{ + struct opal_errorlog *buf; + + lock(&elog_write_lock); + if (list_empty(&elog_write_free)) { + unlock(&elog_write_lock); + if (opal_event_severity == OPAL_ERROR_PANIC) { + lock(&elog_panic_write_lock); + if (panic_write_buffer_valid == 0) { + buf = (struct opal_errorlog *) + panic_write_buffer; + panic_write_buffer_valid = 1; /* In Use */ + unlock(&elog_panic_write_lock); + } else { + unlock(&elog_panic_write_lock); + prerror("ELOG: Write buffer full. Retry later\n"); + return NULL; + } + } else { + prerror("ELOG: Write buffer list is full. 
Retry later\n"); + return NULL; + } + } else { + buf = list_pop(&elog_write_free, struct opal_errorlog, link); + unlock(&elog_write_lock); + } + + memset(buf, 0, sizeof(struct opal_errorlog)); + return buf; +} + +/* Reporting of error via struct opal_errorlog */ +struct opal_errorlog *opal_elog_create(struct opal_err_info *e_info) +{ + struct opal_errorlog *buf; + + buf = get_write_buffer(e_info->sev); + if (buf) { + buf->error_event_type = e_info->err_type; + buf->component_id = e_info->cmp_id; + buf->subsystem_id = e_info->subsystem; + buf->event_severity = e_info->sev; + buf->event_subtype = e_info->event_subtype; + buf->reason_code = e_info->reason_code; + buf->elog_origin = ORG_SAPPHIRE; + } + + return buf; +} + +static void remove_elog_head_entry(void) +{ + struct opal_errorlog *entry; + + lock(&elog_write_lock); + entry = list_pop(&elog_write_pending, struct opal_errorlog, link); + list_add_tail(&elog_write_free, &entry->link); + elog_write_retries = 0; + unlock(&elog_write_lock); +} + +static void opal_fsp_write_complete(struct fsp_msg *read_msg) +{ + uint8_t val; + + val = (read_msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(read_msg); + + switch (val) { + case FSP_STATUS_SUCCESS: + remove_elog_head_entry(); + break; + + default: + if (elog_write_retries++ >= MAX_RETRIES) { + remove_elog_head_entry(); + prerror("ELOG: Error in writing to FSP!\n"); + } + break; + } + + if (opal_send_elog_to_fsp() != OPAL_SUCCESS) + prerror("ELOG: Error sending elog to FSP !\n"); +} + +/* write PEL format hex dump of the log to FSP */ +static int64_t fsp_opal_elog_write(size_t opal_elog_size) +{ + struct fsp_msg *elog_msg; + + elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size, + 0, PSI_DMA_ERRLOG_WRITE_BUF); + if (!elog_msg) { + prerror("ELOG: Failed to create message for WRITE to FSP\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_queue_msg(elog_msg, opal_fsp_write_complete)) { + fsp_freemsg(elog_msg); + elog_msg = NULL; + prerror("FSP: Error queueing elog update\n"); + return OPAL_INTERNAL_ERROR; + } + return OPAL_SUCCESS; +} + +static int opal_send_elog_to_fsp(void) +{ + struct opal_errorlog *head; + int rc = OPAL_SUCCESS; + int pel_offset = 0; + + /* Convert entry to PEL + * and push it down to FSP. We wait for the ack from + * FSP. 
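+ * (The wait is asynchronous: opal_fsp_write_complete() retires the head
+ * entry and kicks off the next pending write once the FSP acks.)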
+ */ + lock(&elog_write_lock); + if (!list_empty(&elog_write_pending)) { + head = list_top(&elog_write_pending, + struct opal_errorlog, link); + pel_offset = create_opal_event(head, (char *)elog_write_buffer); + rc = fsp_opal_elog_write(pel_offset); + unlock(&elog_write_lock); + return rc; + } + unlock(&elog_write_lock); + return rc; +} + +static int opal_push_logs_sync_to_fsp(struct opal_errorlog *buf) +{ + struct fsp_msg *elog_msg; + int opal_elog_size = 0; + int rc = OPAL_SUCCESS; + + lock(&elog_panic_write_lock); + opal_elog_size = create_opal_event(buf, + (char *)elog_panic_write_buffer); + + elog_msg = fsp_mkmsg(FSP_CMD_CREATE_ERRLOG, 3, opal_elog_size, + 0, PSI_DMA_ELOG_PANIC_WRITE_BUF); + if (!elog_msg) { + prerror("ELOG: Failed to create message for WRITE to FSP\n"); + unlock(&elog_panic_write_lock); + return OPAL_INTERNAL_ERROR; + } + + if (fsp_sync_msg(elog_msg, false)) { + fsp_freemsg(elog_msg); + rc = OPAL_INTERNAL_ERROR; + } else { + rc = (elog_msg->resp->word1 >> 8) & 0xff; + fsp_freemsg(elog_msg); + } + + if ((buf == panic_write_buffer) && (panic_write_buffer_valid == 1)) { + panic_write_buffer_valid = 0; + unlock(&elog_panic_write_lock); + } else { + /* buffer got from the elog_write list , put it back */ + unlock(&elog_panic_write_lock); + lock(&elog_write_lock); + list_add_tail(&elog_write_free, &buf->link); + unlock(&elog_write_lock); + } + return rc; +} + +int elog_fsp_commit(struct opal_errorlog *buf) +{ + int rc = OPAL_SUCCESS; + + if (buf->event_severity == OPAL_ERROR_PANIC) { + rc = opal_push_logs_sync_to_fsp(buf); + return rc; + } + + lock(&elog_write_lock); + if (list_empty(&elog_write_pending)) { + list_add_tail(&elog_write_pending, &buf->link); + unlock(&elog_write_lock); + rc = opal_send_elog_to_fsp(); + return rc; + } + list_add_tail(&elog_write_pending, &buf->link); + unlock(&elog_write_lock); + return rc; +} + +/* This function is called from POWERNV to push logs + * on FSP + */ +static int opal_commit_log_to_fsp(struct opal_errorlog *buf) +{ + struct opal_errorlog *opal_buf; + int rc = OPAL_SUCCESS; + + /* Copy the buffer to Sapphire and queue it to push + * to FSP and return + */ + lock(&elog_write_lock); + if (list_empty(&elog_write_free)) { + unlock(&elog_write_lock); + prerror("ELOG: Error! Write buffer list is full. Retry later\n"); + return -1; + } + opal_buf = list_pop(&elog_write_free, struct opal_errorlog, link); + unlock(&elog_write_lock); + memcpy(opal_buf, buf, sizeof(struct opal_errorlog)); + opal_buf->elog_origin = ORG_POWERNV; + rc = elog_fsp_commit(opal_buf); + return rc; +} + +int opal_elog_update_user_dump(struct opal_errorlog *buf, unsigned char *data, + uint32_t tag, uint16_t size) +{ + char *buffer; + struct opal_user_data_section *tmp; + + if (!buf) { + prerror("ELOG: Cannot update user data. 
Buffer is invalid\n"); + return -1; + } + + buffer = (char *)buf->user_data_dump + buf->user_section_size; + if ((buf->user_section_size + size) > OPAL_LOG_MAX_DUMP) { + prerror("ELOG: Size of dump data overruns buffer\n"); + return -1; + } + + tmp = (struct opal_user_data_section *)buffer; + tmp->tag = tag; + tmp->size = size + sizeof(struct opal_user_data_section) - 1; + memcpy(tmp->data_dump, data, size); + + buf->user_section_size += tmp->size; + buf->user_section_count++; + return 0; +} + +/* Create MTMS section for sapphire log */ +static void create_mtms_section(struct opal_errorlog *elog_data, + char *pel_buffer, int *pel_offset) +{ + struct opal_mtms_section *mtms = (struct opal_mtms_section *) + (pel_buffer + *pel_offset); + + mtms->v6header.id = ELOG_SID_MACHINE_TYPE; + mtms->v6header.length = MTMS_SECTION_SIZE; + mtms->v6header.version = OPAL_EXT_HRD_VER; + mtms->v6header.subtype = 0; + mtms->v6header.component_id = elog_data->component_id; + + memset(mtms->model, 0x00, sizeof(mtms->model)); + memcpy(mtms->model, dt_prop_get(dt_root, "model"), OPAL_SYS_MODEL_LEN); + memset(mtms->serial_no, 0x00, sizeof(mtms->serial_no)); + + memcpy(mtms->serial_no, dt_prop_get(dt_root, "system-id"), + OPAL_SYS_SERIAL_LEN); + *pel_offset += MTMS_SECTION_SIZE; +} + +/* Create extended header section */ +static void create_extended_header_section(struct opal_errorlog *elog_data, + char *pel_buffer, int *pel_offset) +{ + const char *opalmodel = NULL; + uint64_t extd_time; + + struct opal_extended_header_section *extdhdr = + (struct opal_extended_header_section *) + (pel_buffer + *pel_offset); + + extdhdr->v6header.id = ELOG_SID_EXTENDED_HEADER; + extdhdr->v6header.length = EXTENDED_HEADER_SECTION_SIZE; + extdhdr->v6header.version = OPAL_EXT_HRD_VER; + extdhdr->v6header.subtype = 0; + extdhdr->v6header.component_id = elog_data->component_id; + + memset(extdhdr->model, 0x00, sizeof(extdhdr->model)); + opalmodel = dt_prop_get(dt_root, "model"); + memcpy(extdhdr->model, opalmodel, OPAL_SYS_MODEL_LEN); + + memset(extdhdr->serial_no, 0x00, sizeof(extdhdr->serial_no)); + memcpy(extdhdr->serial_no, dt_prop_get(dt_root, "system-id"), + OPAL_SYS_SERIAL_LEN); + + memset(extdhdr->opal_release_version, 0x00, + sizeof(extdhdr->opal_release_version)); + memset(extdhdr->opal_subsys_version, 0x00, + sizeof(extdhdr->opal_subsys_version)); + + fsp_rtc_get_cached_tod(&extdhdr->extended_header_date, &extd_time); + extdhdr->extended_header_time = extd_time >> 32; + extdhdr->opal_symid_len = 0; + memset(extdhdr->opalsymid, 0x00, sizeof(extdhdr->opalsymid)); + + *pel_offset += EXTENDED_HEADER_SECTION_SIZE; +} + +/* set src type */ +static void settype(struct opal_src_section *src, uint8_t src_type) +{ + char type[4]; + sprintf(type, "%02X", src_type); + memcpy(src->srcstring, type, 2); +} + +/* set SRC subsystem type */ +static void setsubsys(struct opal_src_section *src, uint8_t src_subsys) +{ + char subsys[4]; + sprintf(subsys, "%02X", src_subsys); + memcpy(src->srcstring+2, subsys, 2); +} + +/* Ser reason code of SRC */ +static void setrefcode(struct opal_src_section *src, uint16_t src_refcode) +{ + char refcode[8]; + sprintf(refcode, "%04X", src_refcode); + memcpy(src->srcstring+4, refcode, 4); +} + +/* Create SRC section of OPAL log */ +static void create_src_section(struct opal_errorlog *elog_data, + char *pel_buffer, int *pel_offset) +{ + struct opal_src_section *src = (struct opal_src_section *) + (pel_buffer + *pel_offset); + + src->v6header.id = ELOG_SID_PRIMARY_SRC; + src->v6header.length = SRC_SECTION_SIZE; + 
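+ /* Remaining v6 section header fields; the 8-character SRC string itself
+ * is built below from the type, failing subsystem and reason code
+ * (2 + 2 + 4 hex digits) via settype/setsubsys/setrefcode.
+ */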
src->v6header.version = OPAL_ELOG_VERSION; + src->v6header.subtype = OPAL_ELOG_SST; + src->v6header.component_id = elog_data->component_id; + + src->version = OPAL_SRC_SEC_VER; + src->flags = 0; + src->wordcount = OPAL_SRC_MAX_WORD_COUNT; + src->srclength = SRC_LENGTH; + settype(src, OPAL_SRC_TYPE_ERROR); + setsubsys(src, OPAL_FAILING_SUBSYSTEM); + setrefcode(src, elog_data->reason_code); + memset(src->hexwords, 0 , (8 * 4)); + src->hexwords[0] = OPAL_SRC_FORMAT; + src->hexwords[4] = elog_data->additional_info[0]; + src->hexwords[5] = elog_data->additional_info[1]; + src->hexwords[6] = elog_data->additional_info[2]; + src->hexwords[7] = elog_data->additional_info[3]; + *pel_offset += SRC_SECTION_SIZE; +} + +/* Create user header section */ +static void create_user_header_section(struct opal_errorlog *elog_data, + char *pel_buffer, int *pel_offset) +{ + struct opal_user_header_section *usrhdr = + (struct opal_user_header_section *) + (pel_buffer + *pel_offset); + + usrhdr->v6header.id = ELOG_SID_USER_HEADER; + usrhdr->v6header.length = USER_HEADER_SECTION_SIZE; + usrhdr->v6header.version = OPAL_ELOG_VERSION; + usrhdr->v6header.subtype = OPAL_ELOG_SST; + usrhdr->v6header.component_id = elog_data->component_id; + + usrhdr->subsystem_id = elog_data->subsystem_id; + usrhdr->event_scope = 0; + usrhdr->event_severity = elog_data->event_severity; + usrhdr->event_type = elog_data->event_subtype; + + if (elog_data->elog_origin == ORG_SAPPHIRE) + usrhdr->action_flags = ERRL_ACTION_REPORT; + else + usrhdr->action_flags = ERRL_ACTION_NONE; + + *pel_offset += USER_HEADER_SECTION_SIZE; +} + +/* Create private header section */ +static void create_private_header_section(struct opal_errorlog *elog_data, + char *pel_buffer, int *pel_offset) +{ + uint64_t ctime; + struct opal_private_header_section *privhdr = + (struct opal_private_header_section *) + pel_buffer; + + privhdr->v6header.id = ELOG_SID_PRIVATE_HEADER; + privhdr->v6header.length = PRIVATE_HEADER_SECTION_SIZE; + privhdr->v6header.version = OPAL_ELOG_VERSION; + privhdr->v6header.subtype = OPAL_ELOG_SST; + privhdr->v6header.component_id = elog_data->component_id; + + fsp_rtc_get_cached_tod(&privhdr->create_date, &ctime); + privhdr->create_time = ctime >> 32; + privhdr->section_count = 5; + + privhdr->creator_subid_hi = 0x00; + privhdr->creator_subid_lo = 0x00; + + if (elog_data->elog_origin == ORG_SAPPHIRE) { + privhdr->plid = ++sapphire_elog_id; + privhdr->creator_id = OPAL_CID_SAPPHIRE; + } else { + privhdr->plid = ++powernv_elog_id; + privhdr->creator_id = OPAL_CID_POWERNV; + } + privhdr->log_entry_id = 0x00; /* entry id is updated by FSP */ + + *pel_offset += PRIVATE_HEADER_SECTION_SIZE; +} + +static void create_user_defined_section(struct opal_errorlog *elog_data, + char *pel_buffer, int *pel_offset) +{ + char *dump = (char *)pel_buffer + *pel_offset; + char *opal_buf = (char *)elog_data->user_data_dump; + struct opal_user_section *usrhdr; + struct opal_user_data_section *opal_usr_data; + struct opal_private_header_section *privhdr = + (struct opal_private_header_section *)pel_buffer; + int i; + + for (i = 0; i < elog_data->user_section_count; i++) { + + usrhdr = (struct opal_user_section *)dump; + opal_usr_data = (struct opal_user_data_section *)opal_buf; + + usrhdr->v6header.id = ELOG_SID_USER_DEFINED; + usrhdr->v6header.version = OPAL_ELOG_VERSION; + usrhdr->v6header.length = sizeof(struct opal_v6_header) + + opal_usr_data->size; + usrhdr->v6header.subtype = OPAL_ELOG_SST; + usrhdr->v6header.component_id = elog_data->component_id; + + 
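+ /* Copy this user data record into the PEL section, then advance the
+ * PEL and source cursors and bump the private header's section count.
+ */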
memcpy(usrhdr->dump, opal_buf, opal_usr_data->size); + *pel_offset += usrhdr->v6header.length; + dump += usrhdr->v6header.length; + opal_buf += opal_usr_data->size; + privhdr->section_count++; + } +} + +/* Create all require section of PEL log and write to TCE buffer */ +static int create_opal_event(struct opal_errorlog *elog_data, char *pel_buffer) +{ + int pel_offset = 0; + + memset(pel_buffer, 0, PSI_DMA_ERRLOG_WRITE_BUF_SZ); + + create_private_header_section(elog_data, pel_buffer, &pel_offset); + create_user_header_section(elog_data, pel_buffer, &pel_offset); + create_src_section(elog_data, pel_buffer, &pel_offset); + create_extended_header_section(elog_data, pel_buffer, &pel_offset); + create_mtms_section(elog_data, pel_buffer, &pel_offset); + if (elog_data->user_section_count) + create_user_defined_section(elog_data, pel_buffer, &pel_offset); + + return pel_offset; +} + +/* Pre-allocate memory for writing error log to FSP */ +static int init_elog_write_free_list(uint32_t num_entries) +{ + struct opal_errorlog *entry; + int i; + + entry = zalloc(sizeof(struct opal_errorlog) * num_entries); + if (!entry) + goto out_err; + + for (i = 0; i < num_entries; ++i) { + list_add_tail(&elog_write_free, &entry->link); + entry++; + } + + /* Pre-allocate one single buffer for PANIC path */ + panic_write_buffer = zalloc(sizeof(struct opal_errorlog)); + if (!panic_write_buffer) + goto out_err; + + return 0; + +out_err: + return -ENOMEM; +} + +/* fsp elog init function */ +void fsp_elog_write_init(void) +{ + if (!fsp_present()) + return; + + elog_panic_write_buffer = memalign(TCE_PSIZE, + ELOG_PANIC_WRITE_BUFFER_SIZE); + if (!elog_panic_write_buffer) { + prerror("FSP: could not allocate ELOG_PANIC_WRITE_BUFFER!\n"); + return; + } + + elog_write_buffer = memalign(TCE_PSIZE, ELOG_WRITE_BUFFER_SIZE); + if (!elog_write_buffer) { + prerror("FSP: could not allocate ELOG_WRITE_BUFFER!\n"); + return; + } + + /* Map TCEs */ + fsp_tce_map(PSI_DMA_ELOG_PANIC_WRITE_BUF, elog_panic_write_buffer, + PSI_DMA_ELOG_PANIC_WRITE_BUF_SZ); + + fsp_tce_map(PSI_DMA_ERRLOG_WRITE_BUF, elog_write_buffer, + PSI_DMA_ERRLOG_WRITE_BUF_SZ); + + /* pre-allocate memory for 128 records */ + if (init_elog_write_free_list(ELOG_WRITE_MAX_RECORD)) { + prerror("ELOG: Cannot allocate WRITE buffers to log errors!\n"); + return; + } + + /* register opal Interface */ + opal_register(OPAL_ELOG_SEND, opal_commit_log_to_fsp, 1); +} diff --git a/hw/fsp/fsp-leds.c b/hw/fsp/fsp-leds.c new file mode 100644 index 00000000..69b05830 --- /dev/null +++ b/hw/fsp/fsp-leds.c @@ -0,0 +1,1080 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +/* + * LED location code and indicator handling + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Debug prefix */ +#define PREFIX "FSPLED: " + +#define buf_write(p, type, val) do { *(type *)(p) = val;\ + p += sizeof(type); } while(0) +#define buf_read(p, type, addr) do { *addr = *(type *)(p);\ + p += sizeof(type); } while(0) + +//#define DBG(fmt...) do { printf(PREFIX fmt); } while(0) +#define DBG(fmt...) do { } while(0) + +/* SPCN replay threshold */ +#define SPCN_REPLAY_THRESHOLD 2 + +/* Sapphire LED support */ +static bool led_support; + +/* + * PSI mapped buffer for LED data + * + * Mapped once and never unmapped. Used for fetching all + * available LED information and creating the list. Also + * used for setting individual LED state. + * + */ +static void *led_buffer; + +/* Maintain list of all LEDs + * + * The contents here will be used to cater requests from FSP + * async commands and HV initiated OPAL calls. + */ +static struct list_head cec_ledq; /* CEC LED list */ +static struct list_head encl_ledq; /* Enclosure LED list */ + +/* LED lock */ +static struct lock led_lock = LOCK_UNLOCKED; + +/* Last SPCN command */ +static u32 last_spcn_cmd; +static int replay = 0; + + +static void fsp_leds_query_spcn(void); +static void fsp_read_leds_data_complete(struct fsp_msg *msg); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_SPCN, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_BUFF, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_LC, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_STATE, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_LED_SUPPORT, OPAL_PLATFORM_ERR_EVT, OPAL_LED, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, OPAL_NA, NULL); + +/* Find descendent LED record with CEC location code in CEC list */ +static struct fsp_led_data * fsp_find_cec_led(char * loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&cec_ledq, led, next, link) { + if (strcmp(led->loc_code, loc_code)) + continue; + return led; + } + return NULL; +} + +/* Find encl LED record with ENCL location code in ENCL list */ +static struct fsp_led_data * fsp_find_encl_led(char * loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&encl_ledq, led, next, link) { + if (strcmp(led->loc_code, loc_code)) + continue; + return led; + } + return NULL; +} + +/* Find encl LED record with CEC location code in CEC list */ +static struct fsp_led_data * fsp_find_encl_cec_led(char *loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&cec_ledq, led, next, link) { + if (strstr(led->loc_code, "-")) + continue; + if (!strstr(loc_code, led->loc_code)) + continue; + return led; + } + return NULL; +} + +/* Find encl LED record with CEC location code in ENCL list */ +static struct fsp_led_data * fsp_find_encl_encl_led(char *loc_code) +{ + struct fsp_led_data *led, *next; + + list_for_each_safe(&encl_ledq, led, next, link) { + if (!strstr(loc_code, led->loc_code)) + continue; + return led; + } + return NULL; +} + +/* Compute the ENCL LED status in CEC list */ +static void compute_encl_status_cec(struct fsp_led_data *encl_led) +{ + struct fsp_led_data *led, *next; + + 
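+	/*
+	 * Recompute the enclosure summary from scratch: clear both the
+	 * identify and fault bits, then OR back in the state of every
+	 * descendant LED whose location code falls under this enclosure.
+	 */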
encl_led->status &= ~SPCN_LED_IDENTIFY_MASK; + encl_led->status &= ~SPCN_LED_FAULT_MASK; + + list_for_each_safe(&cec_ledq, led, next, link) { + if (!strstr(led->loc_code, encl_led->loc_code)) + continue; + + /* Dont count the enclsure LED itself */ + if (!strcmp(led->loc_code, encl_led->loc_code)) + continue; + + if (led->status & SPCN_LED_IDENTIFY_MASK) + encl_led->status |= SPCN_LED_IDENTIFY_MASK; + + if (led->status & SPCN_LED_FAULT_MASK) + encl_led->status |= SPCN_LED_FAULT_MASK; + } +} + +/* Is a enclosure LED */ +static bool is_enclosure_led(char *loc_code) +{ + if (strstr(loc_code, "-")) + return false; + if (!fsp_find_cec_led(loc_code) || !fsp_find_encl_led(loc_code)) + return false; + return true; +} + +/* + * Update both the local LED lists to reflect upon led state changes + * occured with the recent SPCN command. Subsequent LED requests will + * be served with these updates changed to the list. + */ +static void update_led_list(char *loc_code, u32 led_state) +{ + struct fsp_led_data *led = NULL, *encl_led = NULL, *encl_cec_led = NULL; + bool is_encl_led = is_enclosure_led(loc_code); + + if (is_encl_led) + goto enclosure; + + /* Descendant LED in CEC list */ + led = fsp_find_cec_led(loc_code); + if (!led) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "LED: Could not find descendent LED in CEC LC=%s\n", + loc_code); + return; + } + led->status = led_state; + +enclosure: + /* Enclosure LED in CEC list */ + encl_cec_led = fsp_find_encl_cec_led(loc_code); + if (!encl_cec_led) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "LED: Could not find enclosure LED in CEC LC=%s\n", + loc_code); + return; + } + + /* Enclosure LED in ENCL list */ + encl_led = fsp_find_encl_encl_led(loc_code); + if (!encl_led) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "LED: Could not find enclosure LED in ENCL LC=%s\n", + loc_code); + return; + } + + /* Compute descendent rolled up status */ + compute_encl_status_cec(encl_cec_led); + + /* Check whether exclussive bits set */ + if (encl_cec_led->excl_bit & FSP_LED_EXCL_FAULT) + encl_cec_led->status |= SPCN_LED_FAULT_MASK; + + if (encl_cec_led->excl_bit & FSP_LED_EXCL_IDENTIFY) + encl_cec_led->status |= SPCN_LED_IDENTIFY_MASK; + + /* Copy over */ + encl_led->status = encl_cec_led->status; + encl_led->excl_bit = encl_cec_led->excl_bit; +} + +static void fsp_spcn_set_led_completion(struct fsp_msg *msg) +{ + bool fail; + u16 ckpt_status; + char loc_code[LOC_CODE_SIZE + 1]; + struct fsp_msg *resp = msg->resp; + u32 cmd = FSP_RSP_SET_LED_STATE; + u8 status = resp->word1 & 0xff00; + + /* + * LED state update request came as part of FSP async message + * FSP_CMD_SET_LED_STATE, hence need to send response message. 
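+	 * The response is FSP_RSP_SET_LED_STATE, OR'ed with
+	 * FSP_STATUS_GENERIC_ERROR when the SPCN command failed; in that
+	 * case the LED state checkpointed in msg->user_data is restored
+	 * before the reply is queued.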
+ */ + fail = (status == FSP_STATUS_INVALID_DATA) || + (status == FSP_STATUS_DMA_ERROR) || + (status == FSP_STATUS_SPCN_ERROR); + + /* SPCN command failed: Identify the command and roll back changes */ + if (fail) { + log_simple_error(&e_info(OPAL_RC_LED_SPCN), + "LED: Last SPCN command failed, status=%02x\n", + status); + cmd |= FSP_STATUS_GENERIC_ERROR; + + /* Identify the failed command */ + memset(loc_code, 0, sizeof(loc_code)); + strncpy(loc_code, + ((struct fsp_led_data *)(msg->user_data))->loc_code, + LOC_CODE_SIZE); + ckpt_status = ((struct fsp_led_data *)(msg->user_data)) + ->ckpt_status; + + /* Rollback the changes */ + update_led_list(loc_code, ckpt_status); + } + fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg); +} + +/* + * Set the state of the LED pointed by the location code + * + * LED command: FAULT state or IDENTIFY state + * LED state : OFF (reset) or ON (set) + * + * SPCN TCE mapped buffer entries for setting LED state + * + * struct spcn_led_data { + * u8 lc_len; + * u16 state; + * char lc_code[LOC_CODE_SIZE]; + *}; + */ +static int fsp_msg_set_led_state(char *loc_code, bool command, bool state) +{ + struct spcn_led_data sled; + struct fsp_msg *msg = NULL; + struct fsp_led_data *led = NULL; + void *buf = led_buffer; + u16 data_len = 0; + u32 cmd_hdr = 0; + int rc = 0; + + sled.lc_len = strlen(loc_code); + strncpy(sled.lc_code, loc_code, sled.lc_len); + + /* Location code length + Location code + LED control */ + data_len = LOC_CODE_LEN + sled.lc_len + LED_CONTROL_LEN; + cmd_hdr = SPCN_MOD_SET_LED_CTL_LOC_CODE << 24 | SPCN_CMD_SET << 16 | + data_len; + + /* Fetch the current state of LED */ + led = fsp_find_cec_led(loc_code); + + /* LED not present */ + if (led == NULL) { + u32 cmd = 0; + int rc = -1; + + cmd = FSP_RSP_SET_LED_STATE | FSP_STATUS_INVALID_LC; + fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg); + return rc; + } + + /* + * Checkpoint the status here, will use it if the SPCN + * command eventually fails. + */ + led->ckpt_status = led->status; + sled.state = led->status; + + /* Update the exclussive LED bits */ + if (is_enclosure_led(loc_code)) { + if (command == LED_COMMAND_FAULT) { + if (state == LED_STATE_ON) + led->excl_bit |= FSP_LED_EXCL_FAULT; + if (state == LED_STATE_OFF) + led->excl_bit &= ~FSP_LED_EXCL_FAULT; + } + + if (command == LED_COMMAND_IDENTIFY) { + if (state == LED_STATE_ON) + led->excl_bit |= FSP_LED_EXCL_IDENTIFY; + if (state == LED_STATE_OFF) + led->excl_bit &= ~FSP_LED_EXCL_IDENTIFY; + } + } + + /* LED FAULT commad */ + if (command == LED_COMMAND_FAULT) { + if (state == LED_STATE_ON) + sled.state |= SPCN_LED_FAULT_MASK; + if (state == LED_STATE_OFF) + sled.state &= ~SPCN_LED_FAULT_MASK; + } + + /* LED IDENTIFY command */ + if (command == LED_COMMAND_IDENTIFY){ + if (state == LED_STATE_ON) + sled.state |= SPCN_LED_IDENTIFY_MASK; + if (state == LED_STATE_OFF) + sled.state &= ~SPCN_LED_IDENTIFY_MASK; + } + + /* Write into SPCN TCE buffer */ + buf_write(buf, u8, sled.lc_len); /* Location code length */ + strncpy(buf, sled.lc_code, sled.lc_len); /* Location code */ + buf += sled.lc_len; + buf_write(buf, u16, sled.state); /* LED state */ + + msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0, PSI_DMA_LED_BUF); + /* + * Update the local lists based on the attempted SPCN command to + * set/reset an individual led (CEC or ENCL). 
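+	 * The update is applied optimistically, before the SPCN command has
+	 * completed; if it later fails, fsp_spcn_set_led_completion() rolls
+	 * the lists back to the state checkpointed in led->ckpt_status.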
+ */ + lock(&led_lock); + update_led_list(loc_code, sled.state); + msg->user_data = led; + unlock(&led_lock); + + rc = fsp_queue_msg(msg, fsp_spcn_set_led_completion); + return rc; +} + +/* + * Write single location code information into the TCE outbound buffer + * + * Data layout + * + * 2 bytes - Length of location code structure + * 4 bytes - CCIN in ASCII + * 1 byte - Resource status flag + * 1 byte - Indicator state + * 1 byte - Raw loc code length + * 1 byte - Loc code field size + * Field size byte - Null terminated ASCII string padded to 4 byte boundary + * + */ +static u32 fsp_push_data_to_tce(struct fsp_led_data *led, u8 *out_data, + u32 total_size) +{ + struct fsp_loc_code_data lcode; + + /* CCIN value is irrelevant */ + lcode.ccin = 0x0; + + lcode.status = FSP_IND_NOT_IMPLMNTD; + + if (led->parms & SPCN_LED_IDENTIFY_MASK) + lcode.status = FSP_IND_IMPLMNTD; + + /* LED indicator status */ + lcode.ind_state = FSP_IND_INACTIVE; + if (led->status & SPCN_LED_IDENTIFY_MASK) + lcode.ind_state |= FSP_IND_IDENTIFY_ACTV; + if (led->status & SPCN_LED_FAULT_MASK) + lcode.ind_state |= FSP_IND_FAULT_ACTV; + + /* Location code */ + memset(lcode.loc_code, 0, LOC_CODE_SIZE); + lcode.raw_len = strlen(led->loc_code); + strncpy(lcode.loc_code, led->loc_code, lcode.raw_len); + lcode.fld_sz = sizeof(lcode.loc_code); + + /* Rest of the structure */ + lcode.size = sizeof(lcode); + lcode.status &= 0x0f; + + /* + * Check for outbound buffer overflow. If there are still + * more LEDs to be sent across to FSP, dont send, ignore. + */ + if ((total_size + lcode.size) > PSI_DMA_LOC_COD_BUF_SZ) + return 0; + + /* Copy over to the buffer */ + memcpy(out_data, &lcode, sizeof(lcode)); + + return lcode.size; +} + +/* + * Send out LED information structure pointed by "loc_code" + * to FSP through the PSI DMA mapping. Buffer layout structure + * must be followed. + */ +static void fsp_ret_loc_code_list(u16 req_type, char *loc_code) +{ + struct fsp_led_data *led, *next; + + u8 *data; /* Start of TCE mapped buffer */ + u8 *out_data; /* Start of location code data */ + u32 bytes_sent = 0, total_size = 0; + u16 header_size = 0, flags = 0; + + /* Init the addresses */ + data = (u8 *) PSI_DMA_LOC_COD_BUF; + out_data = NULL; + + /* Unmapping through FSP_CMD_RET_LOC_BUFFER command */ + fsp_tce_map(PSI_DMA_LOC_COD_BUF, (void*)data, PSI_DMA_LOC_COD_BUF_SZ); + out_data = data + 8; + + /* CEC LED list */ + list_for_each_safe(&cec_ledq, led, next, link) { + /* + * When the request type is system wide led list + * i.e GET_LC_CMPLT_SYS, send the entire contents + * of the CEC list including both all descendents + * and all of their enclosures. 
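+		 * For the other request types the checks below filter on the
+		 * location code, and the enclosure-only list is handled in a
+		 * separate pass after this loop. Whatever gets selected is
+		 * preceded in the TCE buffer by an 8-byte preamble which, for
+		 * illustration, can be pictured as:
+		 *
+		 *   u32 total_size    - preamble plus all returned entries
+		 *   u16 header_size   - OUTBUF_HEADER_SIZE
+		 *   u16 flags         - 0x8000 for GET_LC_ENCL_DESCENDANTS,
+		 *                       0 otherwise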
+ */ + + if (req_type == GET_LC_ENCLOSURES) + break; + + if (req_type == GET_LC_ENCL_DESCENDANTS) { + if (strstr(led->loc_code, loc_code) == NULL) + continue; + } + + if (req_type == GET_LC_SINGLE_LOC_CODE) { + if (strcmp(led->loc_code, loc_code)) + continue; + } + + /* Push the data into TCE buffer */ + bytes_sent = 0; + bytes_sent = fsp_push_data_to_tce(led, out_data, total_size); + + /* Advance the TCE pointer */ + out_data += bytes_sent; + total_size += bytes_sent; + } + + /* Enclosure LED list */ + if (req_type == GET_LC_ENCLOSURES) { + list_for_each_safe(&encl_ledq, led, next, link) { + + /* Push the data into TCE buffer */ + bytes_sent = 0; + bytes_sent = fsp_push_data_to_tce(led, + out_data, total_size); + + /* Advance the TCE pointer */ + out_data += bytes_sent; + total_size += bytes_sent; + } + } + + /* Count from 'data' instead of 'data_out' */ + total_size += 8; + memcpy(data, &total_size, sizeof(total_size)); + + header_size = OUTBUF_HEADER_SIZE; + memcpy(data + sizeof(total_size), &header_size, sizeof(header_size)); + + if (req_type == GET_LC_ENCL_DESCENDANTS) + flags = 0x8000; + + memcpy(data + sizeof(total_size) + sizeof(header_size), &flags, + sizeof(flags)); + fsp_queue_msg(fsp_mkmsg(FSP_RSP_GET_LED_LIST, + 3, 0, PSI_DMA_LOC_COD_BUF, total_size), + fsp_freemsg); +} + +/* + * FSP async command: FSP_CMD_GET_LED_LIST + * + * (1) FSP sends the list of location codes through inbound buffer + * (2) HV sends the status of those location codes through outbound buffer + * + * Inbound buffer data layout (loc code request structure) + * + * 2 bytes - Length of entire structure + * 2 bytes - Request type + * 1 byte - Raw length of location code + * 1 byte - Location code field size + * `Field size` bytes - NULL terminated ASCII location code string + */ +void fsp_get_led_list(struct fsp_msg *msg) +{ + struct fsp_loc_code_req req; + u32 tce_token = msg->data.words[1]; + void *buf; + + /* Parse inbound buffer */ + buf = fsp_inbound_buf_from_tce(tce_token); + if (!buf) { + fsp_queue_msg(fsp_mkmsg(FSP_RSP_GET_LED_LIST | + FSP_STATUS_INVALID_DATA, + 0), fsp_freemsg); + return; + } + memcpy(&req, buf, sizeof(req)); + + printf(PREFIX "Request for loc code list type 0x%04x LC=%s\n", + req.req_type, req.loc_code); + + fsp_ret_loc_code_list(req.req_type, req.loc_code); +} + +/* + * FSP async command: FSP_CMD_RET_LOC_BUFFER + * + * With this command FSP returns ownership of the outbound buffer + * used by Sapphire to pass the indicator list previous time. That + * way FSP tells Sapphire that it has consumed all the data present + * on the outbound buffer and Sapphire can reuse it for next request. 
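+ * The exchange is, roughly:
+ *
+ *   FSP -> HV : get-LED-list request (location codes via an inbound TCE)
+ *   HV  -> FSP: FSP_RSP_GET_LED_LIST pointing at PSI_DMA_LOC_COD_BUF
+ *   FSP -> HV : buffer-return command once the data has been consumed
+ *   HV        : fsp_tce_unmap() the buffer and reply with
+ *               FSP_RSP_RET_LED_BUFFER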
+ */ +void fsp_free_led_list_buf(struct fsp_msg *msg) +{ + u32 tce_token = msg->data.words[1]; + u32 cmd = FSP_RSP_RET_LED_BUFFER; + + /* Token does not point to outbound buffer */ + if (tce_token != PSI_DMA_LOC_COD_BUF) { + log_simple_error(&e_info(OPAL_RC_LED_BUFF), + "LED: Invalid tce token from FSP\n"); + cmd |= FSP_STATUS_GENERIC_ERROR; + fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg); + return; + } + + /* Unmap the location code DMA buffer */ + fsp_tce_unmap(PSI_DMA_LOC_COD_BUF, PSI_DMA_LOC_COD_BUF_SZ); + + /* Respond the FSP */ + fsp_queue_msg(fsp_mkmsg(cmd, 0), fsp_freemsg); +} + +static void fsp_ret_led_state(char *loc_code) +{ + struct fsp_led_data *led, *next; + u8 ind_state = 0; + + list_for_each_safe(&cec_ledq, led, next, link) { + if (strcmp(loc_code, led->loc_code)) + continue; + + /* Found the location code */ + if (led->status & SPCN_LED_IDENTIFY_MASK) + ind_state |= FSP_IND_IDENTIFY_ACTV; + if (led->status & SPCN_LED_FAULT_MASK) + ind_state |= FSP_IND_FAULT_ACTV; + fsp_queue_msg(fsp_mkmsg(FSP_RSP_GET_LED_STATE, 1, ind_state), + fsp_freemsg); + return; + } + + /* Location code not found */ + log_simple_error(&e_info(OPAL_RC_LED_LC), + "LED: Could not find the location code LC=%s\n", loc_code); + + fsp_queue_msg(fsp_mkmsg(FSP_RSP_GET_LED_STATE | + FSP_STATUS_INVALID_LC, 1, 0xff), fsp_freemsg); +} + +/* + * FSP async command: FSP_CMD_GET_LED_STATE + * + * With this command FSP query the state for any given LED + */ +void fsp_get_led_state(struct fsp_msg *msg) +{ + struct fsp_get_ind_state_req req; + u32 tce_token = msg->data.words[1]; + void *buf; + + /* Parse the inbound buffer */ + buf = fsp_inbound_buf_from_tce(tce_token); + if (!buf) { + fsp_queue_msg(fsp_mkmsg(FSP_RSP_GET_LED_STATE | + FSP_STATUS_INVALID_DATA, 0), + fsp_freemsg); + return; + } + memcpy(&req, buf, sizeof(req)); + + DBG("%s: tce=0x%08x buf=%p rq.sz=%d rq.lc_len=%d rq.fld_sz=%d" + " LC: %02x %02x %02x %02x....\n", __func__, + tce_token, buf, req.size, req.lc_len, req.fld_sz, + req.loc_code[0], req.loc_code[1], + req.loc_code[2], req.loc_code[3]); + + /* Bound check */ + if (req.lc_len >= LOC_CODE_SIZE) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "LED: Loc code too large in %s: %d bytes\n", + __func__, req.lc_len); + req.lc_len = LOC_CODE_SIZE - 1; + } + /* Ensure NULL termination */ + req.loc_code[req.lc_len] = 0; + + /* Do the deed */ + fsp_ret_led_state(req.loc_code); +} + +/* + * FSP async command: FSP_CMD_SET_LED_STATE + * + * With this command FSP sets/resets the state for any given LED + */ +void fsp_set_led_state(struct fsp_msg *msg) +{ + struct fsp_set_ind_state_req req; + struct fsp_led_data *led, *next; + u32 tce_token = msg->data.words[1]; + bool command, state; + void *buf; + + /* Parse the inbound buffer */ + buf = fsp_inbound_buf_from_tce(tce_token); + if (!buf) { + fsp_queue_msg(fsp_mkmsg(FSP_RSP_SET_LED_STATE | + FSP_STATUS_INVALID_DATA, + 0), fsp_freemsg); + return; + } + memcpy(&req, buf, sizeof(req)); + + DBG("%s: tce=0x%08x buf=%p rq.sz=%d rq.typ=0x%04x rq.lc_len=%d" + " rq.fld_sz=%d LC: %02x %02x %02x %02x....\n", __func__, + tce_token, buf, req.size, req.lc_len, req.fld_sz, + req.req_type, + req.loc_code[0], req.loc_code[1], + req.loc_code[2], req.loc_code[3]); + + /* Bound check */ + if (req.lc_len >= LOC_CODE_SIZE) { + log_simple_error(&e_info(OPAL_RC_LED_LC), + "LED: Loc code too large in %s: %d bytes\n", + __func__, req.lc_len); + req.lc_len = LOC_CODE_SIZE - 1; + } + /* Ensure NULL termination */ + req.loc_code[req.lc_len] = 0; + + /* Decode command */ + command = 
(req.ind_state & LOGICAL_IND_STATE_MASK) ? + LED_COMMAND_FAULT : LED_COMMAND_IDENTIFY; + state = (req.ind_state & ACTIVE_LED_STATE_MASK) ? + LED_STATE_ON : LED_STATE_OFF; + + /* Handle requests */ + switch(req.req_type) { + case SET_IND_ENCLOSURE: + list_for_each_safe(&cec_ledq, led, next, link) { + /* Only descendants of the same enclosure */ + if (!strstr(led->loc_code, req.loc_code)) + continue; + + /* Skip the enclosure */ + if (!strcmp(led->loc_code, req.loc_code)) + continue; + + if (fsp_msg_set_led_state(led->loc_code, + command, state)) + log_simple_error(&e_info(OPAL_RC_LED_STATE), + "LED: Set led state failed at LC=%s\n", + led->loc_code); + } + break; + case SET_IND_SINGLE_LOC_CODE: + /* Set led state for single descendent led */ + if (fsp_msg_set_led_state(req.loc_code, command, state)) + log_simple_error(&e_info(OPAL_RC_LED_STATE), + "LED: Set led state failed at LC=%s\n", + req.loc_code); + break; + default: + fsp_queue_msg(fsp_mkmsg(FSP_RSP_SET_LED_STATE | + FSP_STATUS_NOT_SUPPORTED, 0), + fsp_freemsg); + } +} + +/* Handle received indicator message from FSP */ +static bool fsp_indicator_message(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + /* LED support not available yet */ + if (!led_support) { + log_simple_error(&e_info(OPAL_RC_LED_SUPPORT), + PREFIX "Indicator message while LED support not" + " available yet\n"); + return false; + } + + switch(cmd_sub_mod) { + case FSP_CMD_GET_LED_LIST: + printf(PREFIX + "FSP_CMD_GET_LED_LIST command received\n"); + fsp_get_led_list(msg); + return true; + case FSP_CMD_RET_LED_BUFFER: + printf(PREFIX + "FSP_CMD_RET_LED_BUFFER command received\n"); + fsp_free_led_list_buf(msg); + return true; + case FSP_CMD_GET_LED_STATE: + printf(PREFIX + "FSP_CMD_GET_LED_STATE command received\n"); + fsp_get_led_state(msg); + return true; + case FSP_CMD_SET_LED_STATE: + printf(PREFIX + "FSP_CMD_SET_LED_STATE command received\n"); + fsp_set_led_state(msg); + return true; + default: + printf(PREFIX + "Invalid FSP async sub command %06x\n", + cmd_sub_mod); + return false; + } +} + +/* Indicator class client */ +static struct fsp_client fsp_indicator_client = { + .message = fsp_indicator_message, +}; + +/* + * Process the received LED data from SPCN + * + * Every LED state data is added into the CEC list. If the location + * code is a enclosure type, its added into the enclosure list as well. + * + */ +static void fsp_process_leds_data(u16 len) +{ + struct fsp_led_data *led_data = NULL; + void *buf = NULL; + + /* + * Process the entire captured data from the last command + * + * TCE mapped 'led_buffer' contains the fsp_led_data structure + * one after the other till the total lenght 'len'. + * + */ + buf = led_buffer; + while (len) { + /* Prepare */ + led_data = zalloc(sizeof(struct fsp_led_data)); + assert(led_data); + + /* Resource ID */ + buf_read(buf, u16, &led_data->rid); + len -= sizeof(led_data->rid); + + /* Location code length */ + buf_read(buf, u8, &led_data->lc_len); + len -= sizeof(led_data->lc_len); + + if (led_data->lc_len == 0) { + free(led_data); + break; + } + + /* Location code */ + strncpy(led_data->loc_code, buf, led_data->lc_len); + strcat(led_data->loc_code, "\0"); + + buf += led_data->lc_len; + len -= led_data->lc_len; + + /* Parameters */ + buf_read(buf, u16, &led_data->parms); + len -= sizeof(led_data->parms); + + /* Status */ + buf_read(buf, u16, &led_data->status); + len -= sizeof(led_data->status); + + /* + * This is Enclosure LED's location code, need to go + * inside the enclosure LED list as well. 
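+		 * Enclosure location codes carry no '-' separator, e.g.
+		 * (hypothetical values) "U7000.001.ABC1234", whereas a
+		 * descendant such as "U7000.001.ABC1234-P1-C2" does; that is
+		 * what the strstr() check below keys on.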
+ */ + if (!strstr(led_data->loc_code, "-")) { + struct fsp_led_data *encl_led_data = NULL; + encl_led_data = zalloc(sizeof(struct fsp_led_data)); + assert(encl_led_data); + + /* copy over the original */ + encl_led_data->rid = led_data->rid; + encl_led_data->lc_len = led_data->lc_len; + strncpy(encl_led_data->loc_code, led_data->loc_code, + led_data->lc_len); + encl_led_data->loc_code[led_data->lc_len] = '\0'; + encl_led_data->parms = led_data->parms; + encl_led_data->status = led_data->status; + + /* Add to the list of enclosure LEDs */ + list_add_tail(&encl_ledq, &encl_led_data->link); + } + + /* Push this onto the list */ + list_add_tail(&cec_ledq, &led_data->link); + } +} + +/* Replay the SPCN command */ +static void replay_spcn_cmd(u32 last_spcn_cmd) +{ + u32 cmd_hdr = 0; + int rc = 0; + + /* Reached threshold */ + if (replay == SPCN_REPLAY_THRESHOLD) { + replay = 0; + return; + } + + replay++; + if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_FIRST) { + cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 | + SPCN_CMD_PRS << 16; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, + cmd_hdr, 0, + PSI_DMA_LED_BUF), + fsp_read_leds_data_complete); + if (rc) + printf(PREFIX + "Replay SPCN_MOD_PRS_LED_DATA_FIRST" + " command could not be queued\n"); + } + + if (last_spcn_cmd == SPCN_MOD_PRS_LED_DATA_SUB) { + cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | SPCN_CMD_PRS << 16; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, + 0, PSI_DMA_LED_BUF), + fsp_read_leds_data_complete); + if (rc) + printf(PREFIX + "Replay SPCN_MOD_PRS_LED_DATA_SUB" + " command could not be queued\n"); + } +} + +/* + * FSP message response handler for following SPCN LED commands + * which are used to fetch all of the LED data from SPCN + * + * 1. SPCN_MOD_PRS_LED_DATA_FIRST --> First 1KB of LED data + * 2. SPCN_MOD_PRS_LED_DATA_SUB --> Subsequent 1KB of LED data + * + * Once the SPCN_RSP_STATUS_SUCCESS response code has been received + * indicating the last batch of 1KB LED data is here, the list addition + * process is now complete and we enable LED support for FSP async commands + * and for OPAL interface. 
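+ * A typical fetch therefore looks like:
+ *
+ *   PRS_LED_DATA_FIRST -> SPCN_RSP_STATUS_COND_SUCCESS (1KB, more to come)
+ *   PRS_LED_DATA_SUB   -> SPCN_RSP_STATUS_COND_SUCCESS (1KB, more to come)
+ *   PRS_LED_DATA_SUB   -> SPCN_RSP_STATUS_SUCCESS      (final batch)
+ *
+ * Any of the listed error statuses triggers a bounded replay of the last
+ * command via replay_spcn_cmd().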
+ */ +static void fsp_read_leds_data_complete(struct fsp_msg *msg) +{ + struct fsp_led_data *led, *next; + struct fsp_msg *resp = msg->resp; + u32 cmd_hdr = 0; + int rc = 0; + + u32 msg_status = resp->word1 & 0xff00; + u32 led_status = (resp->data.words[1] >> 24) & 0xff; + u16 data_len = (u16)(resp->data.words[1] & 0xffff); + + if (msg_status != FSP_STATUS_SUCCESS) { + log_simple_error(&e_info(OPAL_RC_LED_SUPPORT), + "LED: FSP returned error %x LED not supported\n", + msg_status); + /* LED support not available */ + led_support = false; + return; + } + + /* SPCN command status */ + switch (led_status) { + /* Last 1KB of LED data */ + case SPCN_RSP_STATUS_SUCCESS: + printf(PREFIX + "SPCN_RSP_STATUS_SUCCESS: %d bytes received\n", + data_len); + + /* Copy data to the local list */ + fsp_process_leds_data(data_len); + led_support = true; + + /* LEDs captured on the system */ + printf(PREFIX "CEC LEDs captured on the system:\n"); + list_for_each_safe(&cec_ledq, led, next, link) { + printf(PREFIX "rid: %x\t", led->rid); + printf("len: %x ", led->lc_len); + printf("lcode: %-30s\t", led->loc_code); + printf("parms: %04x\t", led->parms); + printf("status: %04x\n", led->status); + } + + printf(PREFIX "ENCL LEDs captured on the system:\n"); + list_for_each_safe(&encl_ledq, led, next, link) { + printf(PREFIX "rid: %x\t", led->rid); + printf("len: %x ", led->lc_len); + printf("lcode: %-30s\t", led->loc_code); + printf("parms: %04x\t", led->parms); + printf("status: %04x\n", led->status); + } + + break; + + /* If more 1KB of LED data present */ + case SPCN_RSP_STATUS_COND_SUCCESS: + printf(PREFIX + "SPCN_RSP_STATUS_COND_SUCCESS: %d bytes " + " received\n", data_len); + + /* Copy data to the local list */ + fsp_process_leds_data(data_len); + + /* Fetch the remaining data from SPCN */ + last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_SUB; + cmd_hdr = SPCN_MOD_PRS_LED_DATA_SUB << 24 | + SPCN_CMD_PRS << 16; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, + cmd_hdr, + 0, PSI_DMA_LED_BUF), + fsp_read_leds_data_complete); + if (rc) + printf(PREFIX + "SPCN_MOD_PRS_LED_DATA_SUB command" + " could not be queued\n"); + break; + + /* Other expected error codes*/ + case SPCN_RSP_STATUS_INVALID_RACK: + case SPCN_RSP_STATUS_INVALID_SLAVE: + case SPCN_RSP_STATUS_INVALID_MOD: + case SPCN_RSP_STATUS_STATE_PROHIBIT: + case SPCN_RSP_STATUS_UNKNOWN: + /* Replay the previous SPCN command */ + replay_spcn_cmd(last_spcn_cmd); + } + fsp_freemsg(msg); +} + +/* + * Init the LED state + * + * This is called during the host boot process. This is the place where + * we figure out all the LEDs present on the system, their state and then + * create structure out of those information and popullate two master lists. + * One for all the LEDs on the CEC and one for all the LEDs on the enclosure. + * The LED information contained in the lists will cater either to various + * FSP initiated async commands or POWERNV initiated OPAL calls. Need to make + * sure that this initialization process is complete before allowing any requets + * on LED. Also need to be called to re-fetch data from SPCN after any LED state + * have been updated. 
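+ * Until the final SPCN_RSP_STATUS_SUCCESS batch has been folded into the
+ * lists, led_support stays false and fsp_indicator_message() turns away any
+ * incoming FSP LED command.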
+ */ +static void fsp_leds_query_spcn() +{ + struct fsp_led_data *led = NULL; + int rc = 0; + + u32 cmd_hdr = SPCN_MOD_PRS_LED_DATA_FIRST << 24 | SPCN_CMD_PRS << 16; + + /* Till the last batch of LED data */ + led_support = false; + last_spcn_cmd = 0; + + /* Empty the lists */ + while (!list_empty(&cec_ledq)) { + led = list_pop(&cec_ledq, struct fsp_led_data, link); + free(led); + } + + while (!list_empty(&encl_ledq)) { + led = list_pop(&encl_ledq, struct fsp_led_data, link); + free(led); + } + + /* Allocate buffer with alignment requirements */ + if (led_buffer == NULL) { + led_buffer = memalign(TCE_PSIZE, PSI_DMA_LED_BUF_SZ); + if (!led_buffer) + return; + } + + /* TCE mapping - will not unmap */ + fsp_tce_map(PSI_DMA_LED_BUF, led_buffer, PSI_DMA_LED_BUF_SZ); + + /* Request the first 1KB of LED data */ + last_spcn_cmd = SPCN_MOD_PRS_LED_DATA_FIRST; + rc = fsp_queue_msg(fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_hdr, 0, + PSI_DMA_LED_BUF), fsp_read_leds_data_complete); + if (rc) + printf(PREFIX + "SPCN_MOD_PRS_LED_DATA_FIRST command could" + " not be queued\n"); +} + +/* Init the LED subsystem at boot time */ +void fsp_led_init(void) +{ + led_buffer = NULL; + + /* Init the master lists */ + list_head_init(&cec_ledq); + list_head_init(&encl_ledq); + + fsp_leds_query_spcn(); + printf(PREFIX "Init completed\n"); + + /* Handle FSP initiated async LED commands */ + fsp_register_client(&fsp_indicator_client, FSP_MCLASS_INDICATOR); + printf(PREFIX "FSP async command client registered\n"); +} diff --git a/hw/fsp/fsp-mdst-table.c b/hw/fsp/fsp-mdst-table.c new file mode 100644 index 00000000..5b299482 --- /dev/null +++ b/hw/fsp/fsp-mdst-table.c @@ -0,0 +1,252 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + * Sapphire dump design: + * - During initialization we setup Memory Dump Source Table (MDST) table + * which contains address, size pair. + * - We send MDST table update notification to FSP via MBOX command. + * - During Sapphire checkstop: + * - FSP retrieves HWDUMP. + * - FSP retrieves CEC memory based on MDST table. + * - Once Sapphire reboot FSP sends new dump avialable notification via HDAT + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Sapphire dump size + * This is the maximum memory that FSP can retrieve during checkstop. + * + * Note: + * Presently we are hardcoding this parameter. Eventually we need + * new System parameter so that we can get max size dynamically. 
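+ * The constant is only an upper bound: the limit actually enforced when
+ * adding entries is min(MAX_SAPPHIRE_DUMP_SIZE, PSI_DMA_HYP_DUMP_SIZE),
+ * since every region in the table is handed to the FSP as an offset inside
+ * the PSI_DMA_HYP_DUMP TCE window.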
+ */ +#define MAX_SAPPHIRE_DUMP_SIZE 0x1000000 + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_DUMP_MDST_UPDATE, OPAL_PLATFORM_ERR_EVT, OPAL_DUMP, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + + +static struct dump_mdst_table *mdst_table; + +static int cur_mdst_entry; +static int max_mdst_entry; +static int cur_dump_size; +/* + * Presently both sizes are same.. But if someday FSP gives more space + * than our TCE mapping then we need this validation.. + * + * Also once FSP implements MAX_SAPPHIRE_DUMP_SIZE system param, we can + * move this validation to separate function. + */ +static int max_dump_size = MIN(MAX_SAPPHIRE_DUMP_SIZE, PSI_DMA_HYP_DUMP_SIZE); + +/* Protect MDST table entries */ +static struct lock mdst_lock = LOCK_UNLOCKED; + +/* Not supported on P7 */ +static inline bool fsp_mdst_supported(void) +{ + return proc_gen >= proc_gen_p8; +} + +static void update_mdst_table_complete(struct fsp_msg *msg) +{ + uint8_t status = (msg->resp->word1 >> 8) & 0xff; + + if (status) + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE), + "MDST: MDST table update failed: 0x%x\n", + status); + else + printf("MDST: Table updated.\n"); + + fsp_freemsg(msg); +} + +/* Send MDST table to FSP */ +static int64_t fsp_update_mdst_table(void) +{ + struct fsp_msg *msg; + int rc = OPAL_SUCCESS; + + if (cur_mdst_entry <= 0) { + printf("MDST: Table is empty\n"); + return OPAL_INTERNAL_ERROR; + } + + lock(&mdst_lock); + msg = fsp_mkmsg(FSP_CMD_HYP_MDST_TABLE, 4, 0, + PSI_DMA_MDST_TABLE, + sizeof(*mdst_table) * cur_mdst_entry, + sizeof(*mdst_table)); + unlock(&mdst_lock); + + if (!msg) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE), + "MDST: Message allocation failed.!\n"); + rc = OPAL_INTERNAL_ERROR; + } else if (fsp_queue_msg(msg, update_mdst_table_complete)) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_UPDATE), + "MDST: Failed to queue MDST table message.\n"); + fsp_freemsg(msg); + rc = OPAL_INTERNAL_ERROR; + } + return rc; +} + +/* Add entry to MDST table */ +static int __mdst_table_add_entry(void *addr, uint32_t type, uint32_t size) +{ + int rc = OPAL_INTERNAL_ERROR; + + lock(&mdst_lock); + + if (!mdst_table) + goto out; + + if (cur_mdst_entry >= max_mdst_entry) { + printf("MDST: Table is full.\n"); + goto out; + } + + /* Make sure we don't cross dump size limit */ + if (cur_dump_size + size > max_dump_size) { + printf("MDST: %d is crossing max dump size (%d) limit.\n", + cur_dump_size + size, max_dump_size); + goto out; + } + + /* TCE mapping */ + fsp_tce_map(PSI_DMA_HYP_DUMP + cur_dump_size, addr, ALIGN_UP(size, TCE_PSIZE)); + + /* Add entry to MDST table */ + mdst_table[cur_mdst_entry].addr = PSI_DMA_HYP_DUMP + cur_dump_size; + mdst_table[cur_mdst_entry].type = type; + mdst_table[cur_mdst_entry].size = size; + + /* Update MDST count and dump size */ + cur_mdst_entry++; + cur_dump_size += ALIGN_UP(size, TCE_PSIZE); + + printf("MDST: Addr = 0x%llx [size : %d bytes] added to MDST table.\n", + (uint64_t)addr, size); + + rc = OPAL_SUCCESS; + +out: + unlock(&mdst_lock); + return rc; +} + +static int mdst_table_add_entries(void) +{ + int rc; + + /* Add console buffer */ + rc = __mdst_table_add_entry((void *)INMEM_CON_START, + DUMP_SECTION_CONSOLE, INMEM_CON_LEN); + if (rc) + return rc; + + /* Add HBRT buffer */ + rc = __mdst_table_add_entry((void *)HBRT_CON_START, + DUMP_SECTION_HBRT_LOG, HBRT_CON_LEN); + + return rc; +} + +/* 
TCE mapping */ +static inline void mdst_table_tce_map(void) +{ + fsp_tce_map(PSI_DMA_MDST_TABLE, mdst_table, PSI_DMA_MDST_TABLE_SIZE); +} + +/* Initialize MDST table */ +static int mdst_table_init(void) +{ + max_mdst_entry = PSI_DMA_MDST_TABLE_SIZE / sizeof(*mdst_table); + printf("MDST: Max entries in MDST table : %d\n", max_mdst_entry); + + mdst_table = memalign(TCE_PSIZE, PSI_DMA_MDST_TABLE_SIZE); + if (!mdst_table) { + log_simple_error(&e_info(OPAL_RC_DUMP_MDST_INIT), + "MDST: Failed to allocate memory for MDST table.\n"); + return -ENOMEM; + } + + memset(mdst_table, 0, PSI_DMA_MDST_TABLE_SIZE); + mdst_table_tce_map(); + + return OPAL_SUCCESS; +} + +/* + * Handle FSP R/R event. + */ +static bool fsp_mdst_update_rr(uint32_t cmd_sub_mod, + struct fsp_msg *msg __unused) +{ + switch (cmd_sub_mod) { + case FSP_RESET_START: + return true; + case FSP_RELOAD_COMPLETE: /* Send MDST to FSP */ + fsp_update_mdst_table(); + return true; + } + return false; +} + +static struct fsp_client fsp_mdst_client_rr = { + .message = fsp_mdst_update_rr, +}; + +/* Initialize MDST table and send notification to FSP */ +void fsp_mdst_table_init(void) +{ + if (!fsp_present()) + return; + + if (!fsp_mdst_supported()) + return; + + /* Initiate MDST */ + if (mdst_table_init() != OPAL_SUCCESS) + return; + + /* + * Ignore return code from mdst_table_add_entries so that + * we can atleast capture partial dump. + */ + mdst_table_add_entries(); + fsp_update_mdst_table(); + + /* Register for Class AA (FSP R/R) */ + fsp_register_client(&fsp_mdst_client_rr, FSP_MCLASS_RR_EVENT); +} diff --git a/hw/fsp/fsp-mem-err.c b/hw/fsp/fsp-mem-err.c new file mode 100644 index 00000000..8ebaaee5 --- /dev/null +++ b/hw/fsp/fsp-mem-err.c @@ -0,0 +1,415 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +/* debug message prefix */ +#define PREFIX "FSPMEMERR: " + +/* FSP sends real address of 4K memory page. */ +#define MEM_ERR_PAGE_SIZE_4K (1UL << 12) + +/* maximum number of error event to hold until linux consumes it. */ +#define MERR_MAX_RECORD 1024 + +/* FSP response status */ +#define FSP_RESP_STATUS_GENERIC_FAILURE 0xfe + +struct fsp_mem_err_node { + struct list_node list; + struct OpalMemoryErrorData data; +}; + +static LIST_HEAD(merr_free_list); +static LIST_HEAD(mem_error_list); +/* + * lock is used to protect overwriting of merr_free_list and mem_error_list + * list. 
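+ * Each preallocated fsp_mem_err_node cycles between the two: it is popped
+ * off merr_free_list when an error arrives from the FSP, parked on
+ * mem_error_list until opal_queue_msg() accepts the event, and then put
+ * back on merr_free_list.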
+ */ +static struct lock mem_err_lock = LOCK_UNLOCKED; + +void mem_err_info_dump(struct opal_errorlog *buf, void *data, uint16_t size); + +DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_RES, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, mem_err_info_dump); + +DEFINE_LOG_ENTRY(OPAL_RC_MEM_ERR_DEALLOC, OPAL_PLATFORM_ERR_EVT, OPAL_MEM_ERR, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, mem_err_info_dump); + +void mem_err_info_dump(struct opal_errorlog *buf, void *data, uint16_t size) +{ + opal_elog_update_user_dump(buf, data, 0x44455350, size); +} + +static bool send_response_to_fsp(u32 cmd_sub_mod) +{ + struct fsp_msg *rsp; + int rc = -ENOMEM; + + rsp = fsp_mkmsg(cmd_sub_mod, 0); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + /* XXX Generate error logs */ + prerror(PREFIX "Error %d queueing FSP memory error" + " reply\n", rc); + return false; + } + return true; +} + +/* + * Queue up the memory error message for delivery. + * + * queue_event_for_delivery get called from two places. + * 1) from queue_mem_err_node when new fsp mem error is available and + * 2) from completion callback indicating that linux has consumed an message. + * + * TODO: + * There is a chance that, we may not get a free slot to queue our event + * for delivery to linux during both the above invocations. In that case + * we end up holding events with us until next fsp memory error comes in. + * We need to address this case either here OR fix up messaging infrastructure + * to make sure at least one slot will always be available per message type. + * + * XXX: BenH: I changed the msg infrastructure to attempt an allocation + * in that case, at least until we clarify a bit better how + * we want to handle things. + */ +static void queue_event_for_delivery(void *data __unused) +{ + struct fsp_mem_err_node *entry; + uint64_t *merr_data; + int rc; + + lock(&mem_err_lock); + entry = list_pop(&mem_error_list, struct fsp_mem_err_node, list); + unlock(&mem_err_lock); + + if (!entry) + return; + + /* + * struct OpalMemoryErrorData is of (4 * 64 bits) size and well packed + * structure. Hence use uint64_t pointer to pass entire structure + * using 4 params in generic message format. + */ + merr_data = (uint64_t *)&entry->data; + + /* queue up for delivery */ + rc = opal_queue_msg(OPAL_MSG_MEM_ERR, NULL, + queue_event_for_delivery, + merr_data[0], merr_data[1], + merr_data[2], merr_data[3]); + lock(&mem_err_lock); + if (rc) { + /* + * Failed to queue up the event for delivery. No free slot + * available. There is a chance that we are trying to queue + * up multiple event at the same time. We may already have + * at least one event queued up, in that case we will be + * called again through completion callback and we should + * be able to grab empty slot then. + * + * For now, put this node back on mem_error_list. + */ + list_add(&mem_error_list, &entry->list); + } else + list_add(&merr_free_list, &entry->list); + unlock(&mem_err_lock); +} + +static int queue_mem_err_node(struct OpalMemoryErrorData *merr_evt) +{ + struct fsp_mem_err_node *entry; + + lock(&mem_err_lock); + entry = list_pop(&merr_free_list, struct fsp_mem_err_node, list); + if (!entry) { + printf(PREFIX "Failed to queue up memory error event.\n"); + unlock(&mem_err_lock); + return -ENOMEM; + } + + entry->data = *merr_evt; + list_add(&mem_error_list, &entry->list); + unlock(&mem_err_lock); + + /* Queue up the event for delivery to OS. 
*/ + queue_event_for_delivery(NULL); + return 0; +} + +/* Check if memory resilience event for same address already exists. */ +static bool is_resilience_event_exist(u64 paddr) +{ + struct fsp_mem_err_node *entry; + struct OpalMemoryErrorData *merr_evt; + int found = 0; + + lock(&mem_err_lock); + list_for_each(&mem_error_list, entry, list) { + merr_evt = &entry->data; + if ((merr_evt->type == OPAL_MEM_ERR_TYPE_RESILIENCE) && + (merr_evt->u.resilience.physical_address_start + == paddr)) { + found = 1; + break; + } + } + unlock(&mem_err_lock); + return !!found; +} + +/* + * handle Memory Resilience error message. + * Section 28.2 of Hypervisor to FSP Mailbox Interface Specification. + * + * The flow for Memory Resilence Event is: + * 1. PRD component in FSP gets a recoverable attention from hardware when + * there is a corretable/uncorrectable memory error to free up a page. + * 2. PRD sends Memory Resilence Command to hypervisor with the real address of + * the 4K memory page in which the error occurred. + * 3. The hypervisor acknowledges with a status immediately. Immediate + * acknowledgment doesn’t require the freeing of the page to be completed. + */ +static bool handle_memory_resilience(u32 cmd_sub_mod, u64 paddr) +{ + int rc = 0; + u8 err = 0; + struct OpalMemoryErrorData mem_err_evt; + + memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData)); + /* Check arguments */ + if (paddr == 0) { + prerror(PREFIX "memory resilience: Invalid real address.\n"); + err = FSP_RESP_STATUS_GENERIC_FAILURE; + } + + /* If we had an error, send response to fsp and return */ + if (err) + return send_response_to_fsp(FSP_RSP_MEM_RES | err); + + /* Check if event already exist for same address. */ + if (is_resilience_event_exist(paddr)) + goto send_response; + + /* Populate an event. */ + mem_err_evt.version = OpalMemErr_V1; + mem_err_evt.type = OPAL_MEM_ERR_TYPE_RESILIENCE; + + switch (cmd_sub_mod) { + case FSP_CMD_MEM_RES_CE: + /* + * Should we keep counter for corrected errors in + * sapphire OR let linux (PowerNV) handle it? + * + * For now, send corrected errors to linux and let + * linux handle corrected errors thresholding. + */ + mem_err_evt.flags |= OPAL_MEM_CORRECTED_ERROR; + mem_err_evt.u.resilience.resil_err_type = + OPAL_MEM_RESILIENCE_CE; + break; + case FSP_CMD_MEM_RES_UE: + mem_err_evt.u.resilience.resil_err_type = + OPAL_MEM_RESILIENCE_UE; + break; + case FSP_CMD_MEM_RES_UE_SCRB: + mem_err_evt.u.resilience.resil_err_type = + OPAL_MEM_RESILIENCE_UE_SCRUB; + break; + } + mem_err_evt.u.resilience.physical_address_start = paddr; + mem_err_evt.u.resilience.physical_address_end = + paddr + MEM_ERR_PAGE_SIZE_4K; + + /* Queue up the event and inform OS about it. */ + rc = queue_mem_err_node(&mem_err_evt); + +send_response: + /* Queue up an OK response to the resilience message itself */ + if (!rc) + return send_response_to_fsp(FSP_RSP_MEM_RES); + else { + log_error(&e_info(OPAL_RC_MEM_ERR_RES), + &mem_err_evt, sizeof(struct OpalMemoryErrorData), + "OPAL_MEM_ERR: Cannot queue up memory " + "resilience error event to the OS"); + return false; + } +} + +/* update existing event entry if match is found. 
*/ +static bool update_memory_deallocation_event(u64 paddr_start, u64 paddr_end) +{ + struct fsp_mem_err_node *entry; + struct OpalMemoryErrorData *merr_evt; + int found = 0; + + lock(&mem_err_lock); + list_for_each(&mem_error_list, entry, list) { + merr_evt = &entry->data; + if ((merr_evt->type == OPAL_MEM_ERR_TYPE_DYN_DALLOC) && + (merr_evt->u.dyn_dealloc.physical_address_start + == paddr_start)) { + found = 1; + if (merr_evt->u.dyn_dealloc.physical_address_end + < paddr_end) + merr_evt->u.dyn_dealloc.physical_address_end + = paddr_end; + break; + } + } + unlock(&mem_err_lock); + return !!found; +} + +/* + * Handle dynamic memory deallocation message. + * + * When a condition occurs in which we need to do a large scale memory + * deallocation, PRD will send a starting and ending address of an area of + * memory to Hypervisor. Hypervisor then need to use this to deallocate all + * pages between and including the addresses. + * + */ +static bool handle_memory_deallocation(u64 paddr_start, u64 paddr_end) +{ + int rc = 0; + u8 err = 0; + struct OpalMemoryErrorData mem_err_evt; + + memset(&mem_err_evt, 0, sizeof(struct OpalMemoryErrorData)); + /* Check arguments */ + if ((paddr_start == 0) || (paddr_end == 0)) { + prerror(PREFIX "memory deallocation: Invalid " + "starting/ending real address.\n"); + err = FSP_RESP_STATUS_GENERIC_FAILURE; + } + + /* If we had an error, send response to fsp and return */ + if (err) + return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC | err); + + /* + * FSP can send dynamic memory deallocation multiple times for the + * same address/address ranges. Hence check and update if we already + * have sam event queued. + */ + if (update_memory_deallocation_event(paddr_start, paddr_end)) + goto send_response; + + /* Populate an new event. */ + mem_err_evt.version = OpalMemErr_V1; + mem_err_evt.type = OPAL_MEM_ERR_TYPE_DYN_DALLOC; + mem_err_evt.u.dyn_dealloc.dyn_err_type = + OPAL_MEM_DYNAMIC_DEALLOC; + mem_err_evt.u.dyn_dealloc.physical_address_start = paddr_start; + mem_err_evt.u.dyn_dealloc.physical_address_end = paddr_end; + + /* Queue up the event and inform OS about it. */ + rc = queue_mem_err_node(&mem_err_evt); + +send_response: + /* Queue up an OK response to the memory deallocation message itself */ + if (!rc) + return send_response_to_fsp(FSP_RSP_MEM_DYN_DEALLOC); + else { + log_error(&e_info(OPAL_RC_MEM_ERR_DEALLOC), + &mem_err_evt, sizeof(struct OpalMemoryErrorData), + "OPAL_MEM_ERR: Cannot queue up memory " + "deallocation error event to the OS"); + return false; + } +} + +/* Receive a memory error mesages and handle it. */ +static bool fsp_mem_err_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u64 paddr_start, paddr_end; + + printf(PREFIX "Received 0x%08ux command\n", cmd_sub_mod); + switch (cmd_sub_mod) { + case FSP_CMD_MEM_RES_CE: + case FSP_CMD_MEM_RES_UE: + case FSP_CMD_MEM_RES_UE_SCRB: + /* + * We get the memory relilence command from FSP for + * correctable/Uncorrectable/scrub UE errors with real + * address of 4K memory page in which the error occured. 
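+	 * The 64-bit real address arrives in data words 0-1 of the message;
+	 * for the deallocation command handled below, the start address is
+	 * in words 0-1 and the end address in words 2-3.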
+ */ + paddr_start = *((u64 *)&msg->data.words[0]); + printf(PREFIX "Got memory resilience error message for " + "paddr=0x%016llux\n", paddr_start); + return handle_memory_resilience(cmd_sub_mod, paddr_start); + case FSP_CMD_MEM_DYN_DEALLOC: + paddr_start = *((u64 *)&msg->data.words[0]); + paddr_end = *((u64 *)&msg->data.words[2]); + printf(PREFIX "Got dynamic memory deallocation message: " + "paddr_start=0x%016llux, paddr_end=0x%016llux\n", + paddr_start, paddr_end); + return handle_memory_deallocation(paddr_start, paddr_end); + } + return false; +} + +/* + * pre allocate memory to hold maximum of 128 memory error event until linux + * consumes it. + */ +static int init_merr_free_list(uint32_t num_entries) +{ + struct fsp_mem_err_node *entry; + int i; + + entry = zalloc(sizeof(struct fsp_mem_err_node) * num_entries); + if (!entry) + return -ENOMEM; + + for (i = 0; i < num_entries; ++i, entry++) + list_add_tail(&merr_free_list, &entry->list); + + return 0; +} + +static struct fsp_client fsp_mem_err_client = { + .message = fsp_mem_err_msg, +}; + +void fsp_memory_err_init(void) +{ + int rc; + + printf(PREFIX "Intializing fsp memory handling.\n"); + /* If we have an FSP, register for notifications */ + if (!fsp_present()) + return; + + /* pre allocate memory for 128 record */ + rc = init_merr_free_list(MERR_MAX_RECORD); + if (rc < 0) + return; + + fsp_register_client(&fsp_mem_err_client, FSP_MCLASS_MEMORY_ERR); +} diff --git a/hw/fsp/fsp-nvram.c b/hw/fsp/fsp-nvram.c new file mode 100644 index 00000000..b432c376 --- /dev/null +++ b/hw/fsp/fsp-nvram.c @@ -0,0 +1,414 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +//#define DBG(fmt...) printf("RTC: " fmt) +#define DBG(fmt...) do { } while(0) + +/* + * The FSP NVRAM API operates in "blocks" of 4K. It is entirely exposed + * to the OS via the OPAL APIs. + * + * In order to avoid dealing with complicated read/modify/write state + * machines (and added issues related to FSP failover in the middle) + * we keep a memory copy of the entire nvram which we load at boot + * time. We save only modified blocks. + * + * To limit the amount of memory used by the nvram image, we limit + * how much nvram we support to NVRAM_SIZE. Additionally, this limit + * of 1M is the maximum that the CHRP/PAPR nvram partition format + * supports for a partition entry. + * + * (Q: should we save the whole thing in case of FSP failover ?) + * + * The nvram is expected to comply with the CHRP/PAPR defined format, + * and specifically contain a System partition (ID 0x70) named "common" + * with configuration variables for the bootloader and a FW private + * partition for future use by skiboot. + * + * If the partition layout appears broken or lacks one of the above + * partitions, we reformat the entire nvram at boot time. + * + * We do not exploit the ability of the FSP to store a checksum. This + * is documented as possibly going away. 
The CHRP format for nvram + * that Linux uses has its own (though weak) checksum mechanism already + * + */ + +#define NVRAM_BLKSIZE 0x1000 + +struct nvram_triplet { + uint64_t dma_addr; + uint32_t blk_offset; + uint32_t blk_count; +} __packed; + +#define NVRAM_FLAG_CLEAR_WPEND 0x80000000 + +enum nvram_state { + NVRAM_STATE_CLOSED, + NVRAM_STATE_OPENING, + NVRAM_STATE_BROKEN, + NVRAM_STATE_OPEN, + NVRAM_STATE_ABSENT, +}; + +static void *fsp_nvram_image; +static uint32_t fsp_nvram_size; +static struct lock fsp_nvram_lock = LOCK_UNLOCKED; +static struct fsp_msg *fsp_nvram_msg; +static uint32_t fsp_nvram_dirty_start; +static uint32_t fsp_nvram_dirty_end; +static bool fsp_nvram_was_read; +static struct nvram_triplet fsp_nvram_triplet __align(0x1000); +static enum nvram_state fsp_nvram_state = NVRAM_STATE_CLOSED; + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_INIT, OPAL_PLATFORM_ERR_EVT , OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_OPEN, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_SIZE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_READ, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_NVRAM_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_NVRAM, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +static void fsp_nvram_send_write(void); + +static void fsp_nvram_wr_complete(struct fsp_msg *msg) +{ + struct fsp_msg *resp = msg->resp; + uint8_t rc; + + lock(&fsp_nvram_lock); + fsp_nvram_msg = NULL; + + /* Check for various errors. If an error occurred, + * we generally assume the nvram is completely dirty + * but we won't trigger a new write until we get + * either a new attempt at writing, or an FSP reset + * reload (TODO) + */ + if (!resp || resp->state != fsp_msg_response) + goto fail_dirty; + rc = (msg->word1 >> 8) & 0xff; + switch(rc) { + case 0: + case 0x44: + /* Sync to secondary required... XXX */ + case 0x45: + break; + case 0xef: + /* Sync to secondary failed, let's ignore that for now, + * maybe when (if) we handle redundant FSPs ... 
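+	 * Apart from the warning below, 0xef is treated like success; any
+	 * other status falls through to fail_dirty, which re-marks the
+	 * whole image dirty so that a later write attempt resends
+	 * everything.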
+ */ + prerror("FSP: NVRAM sync to secondary failed\n"); + break; + default: + log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE), + "FSP: NVRAM write return error 0x%02x\n", rc); + goto fail_dirty; + } + fsp_freemsg(msg); + if (fsp_nvram_dirty_start <= fsp_nvram_dirty_end) + fsp_nvram_send_write(); + unlock(&fsp_nvram_lock); + return; + fail_dirty: + fsp_nvram_dirty_start = 0; + fsp_nvram_dirty_end = fsp_nvram_size - 1; + fsp_freemsg(msg); + unlock(&fsp_nvram_lock); +} + +static void fsp_nvram_send_write(void) +{ + uint32_t start = fsp_nvram_dirty_start; + uint32_t end = fsp_nvram_dirty_end; + uint32_t count; + + if (start > end || fsp_nvram_state != NVRAM_STATE_OPEN) + return; + count = (end - start) / NVRAM_BLKSIZE + 1; + fsp_nvram_triplet.dma_addr = PSI_DMA_NVRAM_BODY + start; + fsp_nvram_triplet.blk_offset = start / NVRAM_BLKSIZE; + fsp_nvram_triplet.blk_count = count; + fsp_nvram_msg = fsp_mkmsg(FSP_CMD_WRITE_VNVRAM, 6, + 0, PSI_DMA_NVRAM_TRIPL, 1, + NVRAM_FLAG_CLEAR_WPEND, 0, 0); + if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_wr_complete)) { + fsp_freemsg(fsp_nvram_msg); + fsp_nvram_msg = NULL; + log_simple_error(&e_info(OPAL_RC_NVRAM_WRITE), + "FSP: Error queueing nvram update\n"); + return; + } + fsp_nvram_dirty_start = fsp_nvram_size; + fsp_nvram_dirty_end = 0; +} + +static void fsp_nvram_rd_complete(struct fsp_msg *msg) +{ + int64_t rc; + + lock(&fsp_nvram_lock); + + /* Read complete, check status. What to do if the read fails ? + * + * Well, there could be various reasons such as an FSP reboot + * at the wrong time, but there is really not much we can do + * so for now I'll just mark the nvram as closed, and we'll + * attempt a re-open and re-read whenever the OS tries to + * access it + */ + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_nvram_msg = NULL; + fsp_freemsg(msg); + if (rc) { + prerror("FSP: NVRAM read failed, will try again later\n"); + fsp_nvram_state = NVRAM_STATE_CLOSED; + } else { + /* nvram was read once, no need to do it ever again */ + fsp_nvram_was_read = true; + fsp_nvram_state = NVRAM_STATE_OPEN; + + /* XXX Here we should look for nvram settings that concern + * us such as guest kernel arguments etc... + */ + } + unlock(&fsp_nvram_lock); +} + +static void fsp_nvram_send_read(void) +{ + fsp_nvram_msg = fsp_mkmsg(FSP_CMD_READ_VNVRAM, 4, + 0, PSI_DMA_NVRAM_BODY, 0, + fsp_nvram_size / NVRAM_BLKSIZE); + if (fsp_queue_msg(fsp_nvram_msg, fsp_nvram_rd_complete)) { + /* If the nvram read fails to queue, we mark ourselves + * closed. Shouldn't have happened anyway. Not much else + * we can do. 
+ */ + fsp_nvram_state = NVRAM_STATE_CLOSED; + fsp_freemsg(fsp_nvram_msg); + fsp_nvram_msg = NULL; + log_simple_error(&e_info(OPAL_RC_NVRAM_READ), + "FSP: Error queueing nvram read\n"); + return; + } +} + +static void fsp_nvram_open_complete(struct fsp_msg *msg) +{ + int8_t rc; + + lock(&fsp_nvram_lock); + + /* Open complete, check status */ + rc = (msg->resp->word1 >> 8) & 0xff; + fsp_nvram_msg = NULL; + fsp_freemsg(msg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_NVRAM_OPEN), + "FSP: NVRAM open failed, FSP error 0x%02x\n", rc); + goto failed; + } + if (fsp_nvram_was_read) + fsp_nvram_state = NVRAM_STATE_OPEN; + else + fsp_nvram_send_read(); + unlock(&fsp_nvram_lock); + return; + failed: + fsp_nvram_state = NVRAM_STATE_CLOSED; + unlock(&fsp_nvram_lock); +} + +static void fsp_nvram_send_open(void) +{ + printf("FSP NVRAM: Opening nvram...\n"); + fsp_nvram_msg = fsp_mkmsg(FSP_CMD_OPEN_VNVRAM, 1, fsp_nvram_size); + assert(fsp_nvram_msg); + fsp_nvram_state = NVRAM_STATE_OPENING; + if (!fsp_queue_msg(fsp_nvram_msg, fsp_nvram_open_complete)) + return; + + prerror("FSP NVRAM: Failed to queue nvram open message\n"); + fsp_freemsg(fsp_nvram_msg); + fsp_nvram_msg = NULL; + fsp_nvram_state = NVRAM_STATE_CLOSED; +} + +static bool fsp_nvram_get_size(uint32_t *out_size) +{ + struct fsp_msg *msg; + int rc, size; + + msg = fsp_mkmsg(FSP_CMD_GET_VNVRAM_SIZE, 0); + rc = fsp_sync_msg(msg, false); + size = msg->resp ? msg->resp->data.words[0] : 0; + fsp_freemsg(msg); + if (rc || size == 0) { + log_simple_error(&e_info(OPAL_RC_NVRAM_SIZE), + "FSP: Error %d nvram size reported is %d\n", rc, size); + fsp_nvram_state = NVRAM_STATE_BROKEN; + return false; + } + printf("FSP: NVRAM file size from FSP is %d bytes\n", size); + *out_size = size; + return true; +} + +static bool fsp_nvram_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + printf("FSP: Closing NVRAM on account of FSP Reset\n"); + fsp_nvram_state = NVRAM_STATE_CLOSED; + return true; + case FSP_RELOAD_COMPLETE: + printf("FSP: Reopening NVRAM of FSP Reload complete\n"); + lock(&fsp_nvram_lock); + fsp_nvram_send_open(); + unlock(&fsp_nvram_lock); + return true; + } + return false; +} + +static struct fsp_client fsp_nvram_client_rr = { + .message = fsp_nvram_msg_rr, +}; + +int fsp_nvram_info(uint32_t *total_size) +{ + if (!fsp_present()) { + fsp_nvram_state = NVRAM_STATE_ABSENT; + return OPAL_HARDWARE; + } + + if (!fsp_nvram_get_size(total_size)) + return OPAL_HARDWARE; + return OPAL_SUCCESS; +} + +int fsp_nvram_start_read(void *dst, uint32_t src, uint32_t len) +{ + /* We are currently limited to fully aligned transfers */ + assert((((uint64_t)dst) & 0xfff) == 0); + assert(dst); + + /* Currently don't support src!=0 */ + assert(src == 0); + + if (!fsp_present()) + return -ENODEV; + + op_display(OP_LOG, OP_MOD_INIT, 0x0007); + + lock(&fsp_nvram_lock); + + /* Store image info */ + fsp_nvram_image = dst; + fsp_nvram_size = len; + + /* Mark nvram as not dirty */ + fsp_nvram_dirty_start = len; + fsp_nvram_dirty_end = 0; + + /* Map TCEs */ + fsp_tce_map(PSI_DMA_NVRAM_TRIPL, &fsp_nvram_triplet, + PSI_DMA_NVRAM_TRIPL_SZ); + fsp_tce_map(PSI_DMA_NVRAM_BODY, dst, PSI_DMA_NVRAM_BODY_SZ); + + /* Register for the reset/reload event */ + fsp_register_client(&fsp_nvram_client_rr, FSP_MCLASS_RR_EVENT); + + /* Open and load the nvram from the FSP */ + fsp_nvram_send_open(); + + unlock(&fsp_nvram_lock); + + return 0; +} + +int fsp_nvram_write(uint32_t offset, void *src, uint32_t size) +{ + uint64_t end 
= offset + size - 1; + + /* We only support writing from the original image */ + if (src != fsp_nvram_image + offset) + return OPAL_HARDWARE; + + offset &= ~(NVRAM_BLKSIZE - 1); + end &= ~(NVRAM_BLKSIZE - 1); + + lock(&fsp_nvram_lock); + /* If the nvram is closed, try re-opening */ + if (fsp_nvram_state == NVRAM_STATE_CLOSED) + fsp_nvram_send_open(); + if (fsp_nvram_dirty_start > offset) + fsp_nvram_dirty_start = offset; + if (fsp_nvram_dirty_end < end) + fsp_nvram_dirty_end = end; + if (!fsp_nvram_msg && fsp_nvram_state == NVRAM_STATE_OPEN) + fsp_nvram_send_write(); + unlock(&fsp_nvram_lock); + + return 0; +} + +/* This is called right before starting the payload (Linux) to + * ensure the initial open & read of nvram has happened before + * we transfer control as the guest OS. This is necessary as + * Linux will not handle a OPAL_BUSY return properly and treat + * it as an error + */ +void fsp_nvram_wait_open(void) +{ + if (!fsp_present()) + return; + + while(fsp_nvram_state == NVRAM_STATE_OPENING) + fsp_poll(); + + if (!fsp_nvram_was_read) { + log_simple_error(&e_info(OPAL_RC_NVRAM_INIT), + "FSP: NVRAM not read, skipping init\n"); + nvram_read_complete(false); + return; + } + + nvram_read_complete(true); +} diff --git a/hw/fsp/fsp-op-panel.c b/hw/fsp/fsp-op-panel.c new file mode 100644 index 00000000..e2df34ea --- /dev/null +++ b/hw/fsp/fsp-op-panel.c @@ -0,0 +1,249 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_LOG_ENTRY(OPAL_RC_PANEL_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_OP_PANEL, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +static struct fsp_msg op_msg_resp; +static struct fsp_msg op_msg = { + .resp = &op_msg_resp, +}; +static struct fsp_msg *op_req; +static uint64_t op_async_token; +static struct lock op_lock = LOCK_UNLOCKED; + +void op_display(enum op_severity sev, enum op_module mod, uint16_t code) +{ + uint32_t w0 = sev << 16 | mod; + uint32_t w1; + bool clean_lock; + + if (!fsp_present()) + return; + + w1 = tohex((code >> 12) & 0xf) << 24; + w1 |= tohex((code >> 8) & 0xf) << 16; + w1 |= tohex((code >> 4) & 0xf) << 8; + w1 |= tohex((code ) & 0xf); + + /* + * We use lock_recursive to detect recursion. We avoid sending + * the message if that happens as this could be a case of a + * locking error in the FSP driver for example + */ + clean_lock = lock_recursive(&op_lock); + if (!clean_lock) + return; + + /* We don't use mkmsg, we use a preallocated msg to avoid + * going down the malloc path etc... 
since this can be called + * in case of fatal errors + */ + fsp_fillmsg(&op_msg, FSP_CMD_DISP_SRC_DIRECT, 3, 1, w0, w1); + fsp_sync_msg(&op_msg, false); + unlock(&op_lock); +} + +void op_panel_disable_src_echo(void) +{ + if (!fsp_present()) + return; + + lock(&op_lock); + fsp_fillmsg(&op_msg, FSP_CMD_DIS_SRC_ECHO, 0); + fsp_sync_msg(&op_msg, false); + unlock(&op_lock); +} + +void op_panel_clear_src(void) +{ + if (!fsp_present()) + return; + + lock(&op_lock); + fsp_fillmsg(&op_msg, FSP_CMD_CLEAR_SRC, 0); + fsp_sync_msg(&op_msg, false); + unlock(&op_lock); +} + +/* opal_write_oppanel - Write to the physical op panel. + * + * Pass in an array of oppanel_line_t structs defining the ASCII characters + * to display on each line of the oppanel. If there are two lines on the + * physical panel, and you only want to write to the first line, you only + * need to pass in one line. If you only want to write to the second line, + * you need to pass in both lines, and set the line_len of the first line + * to zero. + * + * This command is asynchronous. If OPAL_SUCCESS is returned, then the + * operation was initiated successfully. Subsequent calls will return + * OPAL_BUSY until the current operation is complete. + */ +struct op_src { + uint8_t version; +#define OP_SRC_VERSION 2 + uint8_t flags; + uint8_t reserved; + uint8_t hex_word_cnt; + uint16_t reserved2; + uint16_t total_size; + uint32_t word2; /* SRC format in low byte */ + uint32_t word3; + uint32_t word4; + uint32_t word5; + uint32_t word6; + uint32_t word7; + uint32_t word8; + uint32_t word9; +#define OP_SRC_ASCII_LEN 32 + uint8_t ascii[OP_SRC_ASCII_LEN]; /* Word 11 */ +} __packed __align(4); + +/* Page align for the sake of TCE mapping */ +static struct op_src op_src __align(0x1000); + +static void __op_panel_write_complete(struct fsp_msg *msg) +{ + fsp_tce_unmap(PSI_DMA_OP_PANEL_MISC, 0x1000); + lwsync(); + op_req = NULL; + fsp_freemsg(msg); +} + +static void op_panel_write_complete(struct fsp_msg *msg) +{ + uint8_t rc = (msg->resp->word1 >> 8) & 0xff; + + if (rc) + prerror("OPPANEL: Error 0x%02x in display command\n", rc); + + __op_panel_write_complete(msg); + + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, 1, op_async_token); +} + +static int64_t __opal_write_oppanel(oppanel_line_t *lines, uint64_t num_lines, + uint64_t async_token) +{ + int64_t rc = OPAL_ASYNC_COMPLETION; + int len; + int i; + + if (num_lines < 1 || num_lines > 2) + return OPAL_PARAMETER; + + lock(&op_lock); + + /* Only one in flight */ + if (op_req) { + rc = OPAL_BUSY_EVENT; + goto bail; + } + + op_req = fsp_allocmsg(true); + if (!op_req) { + rc = OPAL_NO_MEM; + goto bail; + } + + op_async_token = async_token; + + memset(&op_src, 0, sizeof(op_src)); + + op_src.version = OP_SRC_VERSION; + op_src.flags = 0; + op_src.reserved = 0; + op_src.hex_word_cnt = 1; /* header word only */ + op_src.reserved2 = 0; + op_src.total_size = sizeof(op_src); + op_src.word2 = 0; /* should be unneeded */ + + len = lines[0].line_len > 16 ? 16 : lines[0].line_len; + + memset(op_src.ascii + len, ' ', 16-len); + memcpy(op_src.ascii, lines[0].line, len); + if (num_lines > 1) { + len = lines[1].line_len > 16 ? 16 : lines[1].line_len; + memcpy(op_src.ascii + 16, lines[1].line, len); + memset(op_src.ascii + 16 + len, ' ', 16-len); + } + + for (i = 0; i < sizeof(op_src.ascii); i++) { + /* + * So, there's this interesting thing if you send + * HTML/Javascript through the Operator Panel. + * You get to inject it into the ASM web ui! 
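+ * (For instance, an oppanel line containing "<script>" would otherwise
+ * reach the ASM web UI verbatim; with the filtering below it is
+ * displayed as ".script." instead.)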
+ * So we filter out anything suspect here, + * at least for the time being. + * + * Allowed characters: + * . / 0-9 : a-z A-Z SPACE + */ + if (! ((op_src.ascii[i] >= '.' && op_src.ascii[i] <= ':') || + (op_src.ascii[i] >= 'a' && op_src.ascii[i] <= 'z') || + (op_src.ascii[i] >= 'A' && op_src.ascii[i] <= 'Z') || + op_src.ascii[i] == ' ')) { + op_src.ascii[i] = '.'; + } + } + + fsp_tce_map(PSI_DMA_OP_PANEL_MISC, &op_src, 0x1000); + + fsp_fillmsg(op_req, FSP_CMD_DISP_SRC_INDIR, 3, 0, + PSI_DMA_OP_PANEL_MISC, sizeof(struct op_src)); + rc = fsp_queue_msg(op_req, op_panel_write_complete); + if (rc) { + __op_panel_write_complete(op_req); + rc = OPAL_INTERNAL_ERROR; + } + bail: + unlock(&op_lock); + log_simple_error(&e_info(OPAL_RC_PANEL_WRITE), + "FSP: Error updating Op Panel: %lld\n", rc); + return rc; +} + +static int64_t opal_write_oppanel_async(uint64_t async_token, + oppanel_line_t *lines, + uint64_t num_lines) +{ + return __opal_write_oppanel(lines, num_lines, async_token); +} + +void fsp_oppanel_init(void) +{ + struct dt_node *oppanel; + + if (!fsp_present()) + return; + + opal_register(OPAL_WRITE_OPPANEL_ASYNC, opal_write_oppanel_async, 3); + + oppanel = dt_new(opal_node, "oppanel"); + dt_add_property_cells(oppanel, "#length", 16); + dt_add_property_cells(oppanel, "#lines", 2); + dt_add_property_string(oppanel, "compatible", "ibm,opal-oppanel"); +} diff --git a/hw/fsp/fsp-rtc.c b/hw/fsp/fsp-rtc.c new file mode 100644 index 00000000..887091ab --- /dev/null +++ b/hw/fsp/fsp-rtc.c @@ -0,0 +1,572 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +//#define DBG(fmt...) printf("RTC: " fmt) +#define DBG(fmt...) do { } while(0) + +/* + * Note on how those operate: + * + * Because the RTC calls can be pretty slow, these functions will shoot + * an asynchronous request to the FSP (if none is already pending) + * + * The requests will return OPAL_BUSY_EVENT as long as the event has + * not been completed. + * + * WARNING: An attempt at doing an RTC write while one is already pending + * will simply ignore the new arguments and continue returning + * OPAL_BUSY_EVENT. This is to be compatible with existing Linux code. + * + * Completion of the request will result in an event OPAL_EVENT_RTC + * being signaled, which will remain raised until a corresponding call + * to opal_rtc_read() or opal_rtc_write() finally returns OPAL_SUCCESS, + * at which point the operation is complete and the event cleared. + * + * If we end up taking longer than rtc_read_timeout_ms millieconds waiting + * for the response from a read request, we simply return a cached value (plus + * an offset calculated from the timebase. When the read request finally + * returns, we update our cache value accordingly. + * + * There is two separate set of state for reads and writes. If both are + * attempted at the same time, the event bit will remain set as long as either + * of the two has a pending event to signal. 
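+ *
+ * As a rough usage sketch (the host-side wrapper names are only
+ * illustrative here, they are not defined in this file), a caller is
+ * expected to spin on the busy return and poll for completion:
+ *
+ *	uint32_t ymd;
+ *	uint64_t hmsm;
+ *	int64_t rc;
+ *
+ *	do {
+ *		rc = opal_rtc_read(&ymd, &hmsm);
+ *		if (rc == OPAL_BUSY_EVENT)
+ *			opal_poll_events(NULL);
+ *	} while (rc == OPAL_BUSY_EVENT);
+ *
+ * Both the FSP interface and the OPAL API use BCD for the date and
+ * time words; for example 2014-07-02 15:36:20 is returned as
+ * ymd = 0x20140702 and hmsm = 0x1536200000000000 (the millisecond
+ * part is always zero here, see tm_to_datetime() below).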
+ */ + +enum { + RTC_TOD_VALID, + RTC_TOD_INVALID, + RTC_TOD_PERMANENT_ERROR, +} rtc_tod_state = RTC_TOD_INVALID; + +static struct lock rtc_lock; +static struct fsp_msg *rtc_read_msg; +static struct fsp_msg *rtc_write_msg; +/* TODO We'd probably want to export and use this variable declared in fsp.c, + * instead of each component individually maintaining the state.. may be for + * later optimization + */ +static bool fsp_in_reset = false; + +/* last synchonisation point */ +static struct { + struct tm tm; + unsigned long tb; + bool dirty; +} rtc_tod_cache; + +/* Timebase value when we last initiated a RTC read request */ +static unsigned long read_req_tb; + +/* If a RTC read takes longer than this, we return a value generated + * from the cache + timebase */ +static const int rtc_read_timeout_ms = 1500; + +DEFINE_LOG_ENTRY(OPAL_RC_RTC_TOD, OPAL_PLATFORM_ERR_EVT, OPAL_RTC, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_RTC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_RTC, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA, NULL); + +static int days_in_month(int month, int year) +{ + static int month_days[] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31, + }; + + assert(1 <= month && month <= 12); + + /* we may need to update this in the year 4000, pending a + * decision on whether or not it's a leap year */ + if (month == 2) { + bool is_leap = !(year % 400) || ((year % 100) && !(year % 4)); + return is_leap ? 29 : 28; + } + + return month_days[month - 1]; +} + +static void tm_add(struct tm *in, struct tm *out, unsigned long secs) +{ + unsigned long year, month, mday, hour, minute, second, d; + static const unsigned long sec_in_400_years = + ((3903ul * 365) + (97 * 366)) * 24 * 60 * 60; + + assert(in); + assert(out); + + second = in->tm_sec; + minute = in->tm_min; + hour = in->tm_hour; + mday = in->tm_mday; + month = in->tm_mon; + year = in->tm_year; + + second += secs; + + /* There are the same number of seconds in any 400-year block; this + * limits the iterations in the loop below */ + year += 400 * (second / sec_in_400_years); + second = second % sec_in_400_years; + + if (second >= 60) { + minute += second / 60; + second = second % 60; + } + + if (minute >= 60) { + hour += minute / 60; + minute = minute % 60; + } + + if (hour >= 24) { + mday += hour / 24; + hour = hour % 24; + } + + for (d = days_in_month(month, year); mday >= d; + d = days_in_month(month, year)) { + month++; + if (month > 12) { + month = 1; + year++; + } + mday -= d; + } + + out->tm_year = year; + out->tm_mon = month; + out->tm_mday = mday; + out->tm_hour = hour; + out->tm_min = minute; + out->tm_sec = second; +} + +/* MSB is byte 3, LSB is byte 0 */ +static unsigned int bcd_byte(uint32_t bcd, int byteno) +{ + bcd >>= byteno * 8; + return (bcd >> 4 & 0xf) * 10 + (bcd & 0xf); +} + +static uint32_t int_to_bcd2(unsigned int x) +{ + return (((x / 10) << 4) & 0xf0) | (x % 10); +} + +static uint32_t int_to_bcd4(unsigned int x) +{ + return int_to_bcd2(x / 100) << 8 | int_to_bcd2(x % 100); +} + +static void rtc_to_tm(struct fsp_msg *msg, struct tm *tm) +{ + uint32_t x; + + /* The FSP returns in BCD: + * + * | year | month | mday | + * +------------------------------------+ + * | hour | minute | secs | reserved | + * +------------------------------------+ + * | microseconds | + */ + x = msg->data.words[0]; + tm->tm_year = bcd_byte(x, 3) * 100 + bcd_byte(x, 2); + tm->tm_mon = bcd_byte(x, 1); + tm->tm_mday = bcd_byte(x, 0); + + x = msg->data.words[1]; + tm->tm_hour = bcd_byte(x, 3); + tm->tm_min = 
bcd_byte(x, 2); + tm->tm_sec = bcd_byte(x, 1); +} + +static void tm_to_datetime(struct tm *tm, uint32_t *y_m_d, uint64_t *h_m_s_m) +{ + uint64_t h_m_s; + /* + * The OPAL API is defined as returned a u64 of a similar + * format to the FSP message; the 32-bit date field is + * in the format: + * + * | year | year | month | day | + * + */ + *y_m_d = int_to_bcd4(tm->tm_year) << 16 | + int_to_bcd2(tm->tm_mon) << 8 | + int_to_bcd2(tm->tm_mday); + + /* + * ... and the 64-bit time field is in the format + * + * | hour | minutes | secs | millisec | + * | ------------------------------------- + * | millisec | reserved | + * + * We simply ignore the microseconds/milliseconds for now + * as I don't quite understand why the OPAL API defines that + * it needs 6 digits for the milliseconds :-) I suspect the + * doc got that wrong and it's supposed to be micro but + * let's ignore it. + * + * Note that Linux doesn't use nor set the ms field anyway. + */ + h_m_s = int_to_bcd2(tm->tm_hour) << 24 | + int_to_bcd2(tm->tm_min) << 16 | + int_to_bcd2(tm->tm_sec) << 8; + + *h_m_s_m = h_m_s << 32; +} + +static void fsp_rtc_process_read(struct fsp_msg *read_resp) +{ + int val = (read_resp->word1 >> 8) & 0xff; + + switch (val) { + case 0xa9: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TOD in invalid state\n"); + rtc_tod_state = RTC_TOD_INVALID; + break; + + case 0xaf: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TOD in permanent error state\n"); + rtc_tod_state = RTC_TOD_PERMANENT_ERROR; + break; + + case 0: + /* Save the read RTC value in our cache */ + rtc_to_tm(read_resp, &rtc_tod_cache.tm); + rtc_tod_cache.tb = mftb(); + rtc_tod_state = RTC_TOD_VALID; + break; + + default: + log_simple_error(&e_info(OPAL_RC_RTC_TOD), + "RTC TOD read failed: %d\n", val); + rtc_tod_state = RTC_TOD_INVALID; + } +} + +static void opal_rtc_eval_events(void) +{ + bool pending = false; + + if (rtc_read_msg && !fsp_msg_busy(rtc_read_msg)) + pending = true; + if (rtc_write_msg && !fsp_msg_busy(rtc_write_msg)) + pending = true; + opal_update_pending_evt(OPAL_EVENT_RTC, pending ? 
OPAL_EVENT_RTC : 0); +} + +static void fsp_rtc_req_complete(struct fsp_msg *msg) +{ + lock(&rtc_lock); + DBG("RTC completion %p\n", msg); + if (msg == rtc_read_msg) + fsp_rtc_process_read(msg->resp); + opal_rtc_eval_events(); + unlock(&rtc_lock); +} + +static int64_t fsp_rtc_send_read_request(void) +{ + struct fsp_msg *msg; + int rc; + + msg = fsp_mkmsg(FSP_CMD_READ_TOD, 0); + if (!msg) { + log_simple_error(&e_info(OPAL_RC_RTC_READ), + "RTC: failed to allocate read message\n"); + return OPAL_INTERNAL_ERROR; + } + + rc = fsp_queue_msg(msg, fsp_rtc_req_complete); + if (rc) { + fsp_freemsg(msg); + log_simple_error(&e_info(OPAL_RC_RTC_READ), + "RTC: failed to queue read message: %d\n", rc); + return OPAL_INTERNAL_ERROR; + } + + read_req_tb = mftb(); + rtc_read_msg = msg; + + return OPAL_BUSY_EVENT; +} + +static void encode_cached_tod(uint32_t *year_month_day, + uint64_t *hour_minute_second_millisecond) +{ + unsigned long cache_age_sec; + struct tm tm; + + cache_age_sec = tb_to_msecs(mftb() - rtc_tod_cache.tb) / 1000; + + tm_add(&rtc_tod_cache.tm, &tm, cache_age_sec); + + /* Format to OPAL API values */ + tm_to_datetime(&tm, year_month_day, hour_minute_second_millisecond); +} + +int fsp_rtc_get_cached_tod(uint32_t *year_month_day, + uint64_t *hour_minute_second_millisecond) +{ + + if (rtc_tod_state != RTC_TOD_VALID) + return -1; + + encode_cached_tod(year_month_day, + hour_minute_second_millisecond); + return 0; +} + +static int64_t fsp_opal_rtc_read(uint32_t *year_month_day, + uint64_t *hour_minute_second_millisecond) +{ + struct fsp_msg *msg; + int64_t rc; + + if (!year_month_day || !hour_minute_second_millisecond) + return OPAL_PARAMETER; + + lock(&rtc_lock); + /* During R/R of FSP, read cached TOD */ + if (fsp_in_reset) { + fsp_rtc_get_cached_tod(year_month_day, + hour_minute_second_millisecond); + rc = OPAL_SUCCESS; + goto out; + } + + msg = rtc_read_msg; + + if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) { + if (msg && !fsp_msg_busy(msg)) + fsp_freemsg(msg); + rc = OPAL_HARDWARE; + goto out; + } + + /* If we don't have a read pending already, fire off a request and + * return */ + if (!msg) { + DBG("Sending new RTC read request\n"); + rc = fsp_rtc_send_read_request(); + + /* If our pending read is done, clear events and return the time + * from the cache */ + } else if (!fsp_msg_busy(msg)) { + DBG("RTC read complete, state %d\n", rtc_tod_state); + + rtc_read_msg = NULL; + opal_rtc_eval_events(); + fsp_freemsg(msg); + + if (rtc_tod_state == RTC_TOD_VALID) { + encode_cached_tod(year_month_day, + hour_minute_second_millisecond); + rc = OPAL_SUCCESS; + } else + rc = OPAL_INTERNAL_ERROR; + + /* Timeout: return our cached value (updated from tb), but leave the + * read request pending so it will update the cache later */ + } else if (mftb() > read_req_tb + msecs_to_tb(rtc_read_timeout_ms)) { + DBG("RTC read timed out\n"); + + encode_cached_tod(year_month_day, + hour_minute_second_millisecond); + rc = OPAL_SUCCESS; + + /* Otherwise, we're still waiting on the read to complete */ + } else { + rc = OPAL_BUSY_EVENT; + } +out: + unlock(&rtc_lock); + return rc; +} + +static int64_t fsp_opal_rtc_write(uint32_t year_month_day, + uint64_t hour_minute_second_millisecond) +{ + struct fsp_msg *msg; + uint32_t w0, w1, w2; + int64_t rc; + + lock(&rtc_lock); + if (rtc_tod_state == RTC_TOD_PERMANENT_ERROR) { + rc = OPAL_HARDWARE; + msg = NULL; + goto bail; + } + + /* Do we have a request already ? 
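+ * If one is still in flight, the new time is silently dropped and we
+ * keep returning OPAL_BUSY_EVENT, as described in the warning at the
+ * top of this file.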
*/ + msg = rtc_write_msg; + if (msg) { + /* If it's still in progress, return */ + if (fsp_msg_busy(msg)) { + /* Don't free the message */ + msg = NULL; + rc = OPAL_BUSY_EVENT; + goto bail; + } + + DBG("Completed write request @%p, state=%d\n", msg, msg->state); + /* It's complete, clear events */ + rtc_write_msg = NULL; + opal_rtc_eval_events(); + + /* Check error state */ + if (msg->state != fsp_msg_done) { + DBG(" -> request not in done state -> error !\n"); + rc = OPAL_INTERNAL_ERROR; + goto bail; + } + rc = OPAL_SUCCESS; + goto bail; + } + + DBG("Sending new write request...\n"); + + /* Create a request and send it. Just like for read, we ignore + * the "millisecond" field which is probably supposed to be + * microseconds and which Linux ignores as well anyway + */ + w0 = year_month_day; + w1 = (hour_minute_second_millisecond >> 32) & 0xffffff00; + w2 = 0; + + rtc_write_msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, w0, w1, w2); + if (!rtc_write_msg) { + DBG(" -> allocation failed !\n"); + rc = OPAL_INTERNAL_ERROR; + goto bail; + } + DBG(" -> req at %p\n", rtc_write_msg); + + if (fsp_in_reset) { + rtc_to_tm(rtc_write_msg, &rtc_tod_cache.tm); + rtc_tod_cache.tb = mftb(); + rtc_tod_cache.dirty = true; + fsp_freemsg(rtc_write_msg); + rtc_write_msg = NULL; + rc = OPAL_SUCCESS; + goto bail; + } else if (fsp_queue_msg(rtc_write_msg, fsp_rtc_req_complete)) { + DBG(" -> queueing failed !\n"); + rc = OPAL_INTERNAL_ERROR; + fsp_freemsg(rtc_write_msg); + rtc_write_msg = NULL; + goto bail; + } + rc = OPAL_BUSY_EVENT; + bail: + unlock(&rtc_lock); + if (msg) + fsp_freemsg(msg); + return rc; +} + +static void rtc_flush_cached_tod(void) +{ + struct fsp_msg *msg; + uint64_t h_m_s_m; + uint32_t y_m_d; + + if (fsp_rtc_get_cached_tod(&y_m_d, &h_m_s_m)) + return; + msg = fsp_mkmsg(FSP_CMD_WRITE_TOD, 3, y_m_d, + (h_m_s_m >> 32) & 0xffffff00, 0); + if (msg) + fsp_queue_msg(msg, fsp_freemsg); +} + +static bool fsp_rtc_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + + int rc = false; + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + lock(&rtc_lock); + fsp_in_reset = true; + unlock(&rtc_lock); + rc = true; + break; + case FSP_RELOAD_COMPLETE: + lock(&rtc_lock); + fsp_in_reset = false; + if (rtc_tod_cache.dirty) { + rtc_flush_cached_tod(); + rtc_tod_cache.dirty = false; + } + unlock(&rtc_lock); + rc = true; + break; + } + + return rc; +} + +static struct fsp_client fsp_rtc_client_rr = { + .message = fsp_rtc_msg_rr, +}; + +void fsp_rtc_init(void) +{ + struct fsp_msg msg, resp; + int rc; + + if (!fsp_present()) { + rtc_tod_state = RTC_TOD_PERMANENT_ERROR; + return; + } + + opal_register(OPAL_RTC_READ, fsp_opal_rtc_read, 2); + opal_register(OPAL_RTC_WRITE, fsp_opal_rtc_write, 2); + + /* Register for the reset/reload event */ + fsp_register_client(&fsp_rtc_client_rr, FSP_MCLASS_RR_EVENT); + + msg.resp = &resp; + fsp_fillmsg(&msg, FSP_CMD_READ_TOD, 0); + + DBG("Getting initial RTC TOD\n"); + + lock(&rtc_lock); + + rc = fsp_sync_msg(&msg, false); + + if (rc >= 0) + fsp_rtc_process_read(&resp); + else + rtc_tod_state = RTC_TOD_PERMANENT_ERROR; + + unlock(&rtc_lock); +} diff --git a/hw/fsp/fsp-sensor.c b/hw/fsp/fsp-sensor.c new file mode 100644 index 00000000..f4fc19d2 --- /dev/null +++ b/hw/fsp/fsp-sensor.c @@ -0,0 +1,788 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + */ + + +/* + * Design note: + * This code will enable the 'powernv' to retrieve sensor related data from FSP + * using SPCN passthru mailbox commands. + * + * The OPAL read sensor API in Sapphire is implemented as an 'asynchronous' read + * call that returns after queuing the read request. A unique sensor-id is + * expected as an argument for OPAL read call which has already been exported + * to the device tree during fsp init. The sapphire code decodes this Id to + * determine requested attribute and sensor. + */ + +#include +#include +#include +#include +#include +#include +#include + +//#define DBG(fmt...) printf("SENSOR: " fmt) +#define DBG(fmt...) do { } while (0) + +#define SENSOR_PREFIX "sensor: " +#define INVALID_DATA ((uint32_t)-1) + +/* Entry size of PRS command modifiers */ +#define PRS_STATUS_ENTRY_SZ 0x08 +#define SENSOR_PARAM_ENTRY_SZ 0x10 +#define SENSOR_DATA_ENTRY_SZ 0x08 +#define PROC_JUNC_ENTRY_SZ 0x04 + +DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR, + OPAL_MISC_SUBSYSTEM, + OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_READ, OPAL_PLATFORM_ERR_EVT, OPAL_SENSOR, + OPAL_MISC_SUBSYSTEM, OPAL_INFO, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_SENSOR_ASYNC_COMPLETE, OPAL_PLATFORM_ERR_EVT, + OPAL_SENSOR, OPAL_MISC_SUBSYSTEM, OPAL_INFO, + OPAL_NA, NULL); + +/* FSP response status codes */ +enum { + SP_RSP_STATUS_VALID_DATA = 0x00, + SP_RSP_STATUS_INVALID_DATA = 0x22, + SP_RSP_STATUS_SPCN_ERR = 0xA8, + SP_RSP_STATUS_DMA_ERR = 0x24, +}; + +enum sensor_state { + SENSOR_VALID_DATA, + SENSOR_INVALID_DATA, + SENSOR_SPCN_ERROR, + SENSOR_DMA_ERROR, + SENSOR_PERMANENT_ERROR, + SENSOR_OPAL_ERROR, +}; + +enum spcn_attr { + /* mod 0x01, 0x02 */ + SENSOR_PRESENT, + SENSOR_FAULTED, + SENSOR_AC_FAULTED, + SENSOR_ON, + SENSOR_ON_SUPPORTED, + /* mod 0x10, 0x11 */ + SENSOR_THRS, + SENSOR_LOCATION, + /* mod 0x12, 0x13 */ + SENSOR_DATA, + /* mod 0x1c */ + SENSOR_POWER, + + SENSOR_MAX, +}; + +/* Parsed sensor attributes, passed through OPAL */ +struct opal_sensor_data { + uint64_t async_token; /* Asynchronous token */ + uint32_t *sensor_data; /* Kernel pointer to copy data */ + enum spcn_attr spcn_attr; /* Modifier attribute */ + uint16_t rid; /* Sensor RID */ + uint8_t frc; /* Sensor resource class */ + uint32_t mod_index; /* Modifier index*/ + uint32_t offset; /* Offset in sensor buffer */ +}; + +struct spcn_mod_attr { + const char *name; + enum spcn_attr val; +}; + +struct spcn_mod { + uint8_t mod; /* Modifier code */ + uint8_t entry_size; /* Size of each entry in response buffer */ + uint16_t entry_count; /* Number of entries */ + struct spcn_mod_attr *mod_attr; +}; + +static struct spcn_mod_attr prs_status_attrs[] = { + {"present", SENSOR_PRESENT}, + {"faulted", SENSOR_FAULTED}, + {"ac-faulted", SENSOR_AC_FAULTED}, + {"on", SENSOR_ON}, + {"on-supported", SENSOR_ON_SUPPORTED} +}; + +static struct spcn_mod_attr sensor_param_attrs[] = { + {"thrs", SENSOR_THRS}, + {"loc", SENSOR_LOCATION} +}; + +static struct spcn_mod_attr sensor_data_attrs[] = { + {"data", SENSOR_DATA} +}; + +static struct 
spcn_mod_attr sensor_power_attrs[] = { + {"power", SENSOR_POWER} +}; + +static struct spcn_mod spcn_mod_data[] = { + {SPCN_MOD_PRS_STATUS_FIRST, PRS_STATUS_ENTRY_SZ, 0, + prs_status_attrs}, + {SPCN_MOD_PRS_STATUS_SUBS, PRS_STATUS_ENTRY_SZ, 0, + prs_status_attrs}, + {SPCN_MOD_SENSOR_PARAM_FIRST, SENSOR_PARAM_ENTRY_SZ, 0, + sensor_param_attrs}, + {SPCN_MOD_SENSOR_PARAM_SUBS, SENSOR_PARAM_ENTRY_SZ, 0, + sensor_param_attrs}, + {SPCN_MOD_SENSOR_DATA_FIRST, SENSOR_DATA_ENTRY_SZ, 0, + sensor_data_attrs}, + {SPCN_MOD_SENSOR_DATA_SUBS, SENSOR_DATA_ENTRY_SZ, 0, + sensor_data_attrs}, + /* TODO Support this modifier '0x14', if required */ + /* {SPCN_MOD_PROC_JUNC_TEMP, PROC_JUNC_ENTRY_SZ, 0, NULL}, */ + {SPCN_MOD_SENSOR_POWER, SENSOR_DATA_ENTRY_SZ, 0, + sensor_power_attrs}, + {SPCN_MOD_LAST, 0xff, 0xffff, NULL} +}; + +/* Frame resource class (FRC) names */ +static const char *frc_names[] = { + /* 0x00 and 0x01 are reserved */ + NULL, + NULL, + "power-controller", + "power-supply", + "regulator", + "cooling-fan", + "cooling-controller", + "battery-charger", + "battery-pack", + "amb-temp", + "temp", + "vrm", + "riser-card", + "io-backplane" +}; + +#define SENSOR_MAX_SIZE 0x00100000 +static void *sensor_buffer = NULL; +static enum sensor_state sensor_state; +static bool prev_msg_consumed = true; +static struct lock sensor_lock; + +/* Function prototypes */ +static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr); +static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr); + + +/* + * Power Resource Status (PRS) + * Command: 0x42 + * + * Modifier: 0x01 + * -------------------------------------------------------------------------- + * | 0 1 2 3 4 5 6 7 | + * -------------------------------------------------------------------------- + * |Frame resrc class| PRID | SRC | Status | + * -------------------------------------------------------------------------- + * + * + * Modifier: 0x10 + * -------------------------------------------------------------------------- + * | 0 1 2 3 4 5 6 7 | + * -------------------------------------------------------------------------- + * |Frame resrc class| PRID | Sensor location | + * -------------------------------------------------------------------------- + * -------------------------------------------------------------------------- + * | 8 9 10 11 12 13 14 15 | + * -------------------------------------------------------------------------- + * | Reserved | Reserved | Threshold | Status | + * -------------------------------------------------------------------------- + * + * + * Modifier: 0x12 + * -------------------------------------------------------------------------- + * | 0 1 2 3 4 5 6 7 | + * -------------------------------------------------------------------------- + * |Frame resrc class| PRID | Sensor data | Status | + * -------------------------------------------------------------------------- + * + * + * Modifier: 0x14 + * -------------------------------------------------------------------------- + * | 0 1 2 3 | + * -------------------------------------------------------------------------- + * |Enclosure Tj Avg | Chip Tj Avg | Reserved | Reserved | + * -------------------------------------------------------------------------- + */ + +static void fsp_sensor_process_data(struct opal_sensor_data *attr) +{ + uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer; + uint32_t sensor_data = INVALID_DATA; + uint16_t sensor_mod_data[8]; + int count, i; + uint8_t valid, nr_power; + uint32_t power; + + for (count = 0; count < 
spcn_mod_data[attr->mod_index].entry_count; + count++) { + memcpy((void *)sensor_mod_data, sensor_buf_ptr, + spcn_mod_data[attr->mod_index].entry_size); + if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) { + /* TODO Support this modifier '0x14', if required */ + + } else if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_SENSOR_POWER) { + valid = sensor_buf_ptr[0]; + if (valid & 0x80) { + nr_power = valid & 0x0f; + sensor_data = 0; + for (i=0; i < nr_power; i++) { + power = *(uint32_t *) &sensor_buf_ptr[2 + i * 5]; + DBG("Power[%d]: %d mW\n", i, power); + sensor_data += power/1000; + } + } else { + DBG("Power Sensor data not valid\n"); + } + } else if (sensor_mod_data[0] == attr->frc && + sensor_mod_data[1] == attr->rid) { + switch (attr->spcn_attr) { + /* modifier 0x01, 0x02 */ + case SENSOR_PRESENT: + DBG("Not exported to device tree\n"); + break; + case SENSOR_FAULTED: + sensor_data = sensor_mod_data[3] & 0x02; + break; + case SENSOR_AC_FAULTED: + case SENSOR_ON: + case SENSOR_ON_SUPPORTED: + DBG("Not exported to device tree\n"); + break; + /* modifier 0x10, 0x11 */ + case SENSOR_THRS: + sensor_data = sensor_mod_data[6]; + break; + case SENSOR_LOCATION: + DBG("Not exported to device tree\n"); + break; + /* modifier 0x12, 0x13 */ + case SENSOR_DATA: + sensor_data = sensor_mod_data[2]; + break; + default: + break; + } + + break; + } + + sensor_buf_ptr += spcn_mod_data[attr->mod_index].entry_size; + } + + *(attr->sensor_data) = sensor_data; + if (sensor_data == INVALID_DATA) + queue_msg_for_delivery(OPAL_PARTIAL, attr); + else + queue_msg_for_delivery(OPAL_SUCCESS, attr); +} + +static int fsp_sensor_process_read(struct fsp_msg *resp_msg) +{ + uint8_t mbx_rsp_status; + uint32_t size = 0; + + mbx_rsp_status = (resp_msg->word1 >> 8) & 0xff; + switch (mbx_rsp_status) { + case SP_RSP_STATUS_VALID_DATA: + sensor_state = SENSOR_VALID_DATA; + size = resp_msg->data.words[1] & 0xffff; + break; + case SP_RSP_STATUS_INVALID_DATA: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Received invalid data\n", __func__); + sensor_state = SENSOR_INVALID_DATA; + break; + case SP_RSP_STATUS_SPCN_ERR: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failure due to SPCN error\n", __func__); + sensor_state = SENSOR_SPCN_ERROR; + break; + case SP_RSP_STATUS_DMA_ERR: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failure due to DMA error\n", __func__); + sensor_state = SENSOR_DMA_ERROR; + break; + default: + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR %s: Read failed, status:0x%02X\n", + __func__, mbx_rsp_status); + sensor_state = SENSOR_INVALID_DATA; + break; + } + + return size; +} + +static void queue_msg_for_delivery(int rc, struct opal_sensor_data *attr) +{ + DBG("%s: rc:%d, data:%d\n", __func__, rc, *(attr->sensor_data)); + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + attr->async_token, rc); + spcn_mod_data[attr->mod_index].entry_count = 0; + free(attr); + prev_msg_consumed = true; +} + +static void fsp_sensor_read_complete(struct fsp_msg *msg) +{ + struct opal_sensor_data *attr = msg->user_data; + enum spcn_rsp_status status; + int rc, size; + + DBG("Sensor read completed\n"); + + status = (msg->resp->data.words[1] >> 24) & 0xff; + size = fsp_sensor_process_read(msg->resp); + fsp_freemsg(msg); + + lock(&sensor_lock); + if (sensor_state == SENSOR_VALID_DATA) { + spcn_mod_data[attr->mod_index].entry_count += (size / + spcn_mod_data[attr->mod_index].entry_size); + attr->offset += size; + /* Fetch the subsequent entries of the same 
modifier type */ + if (status == SPCN_RSP_STATUS_COND_SUCCESS) { + switch (spcn_mod_data[attr->mod_index].mod) { + case SPCN_MOD_PRS_STATUS_FIRST: + case SPCN_MOD_SENSOR_PARAM_FIRST: + case SPCN_MOD_SENSOR_DATA_FIRST: + attr->mod_index++; + spcn_mod_data[attr->mod_index].entry_count = + spcn_mod_data[attr->mod_index - 1]. + entry_count; + spcn_mod_data[attr->mod_index - 1].entry_count = 0; + break; + default: + break; + } + + rc = fsp_sensor_send_read_request(attr); + if (rc != OPAL_ASYNC_COMPLETION) + goto err; + } else { /* Notify 'powernv' of read completion */ + fsp_sensor_process_data(attr); + } + } else { + rc = OPAL_INTERNAL_ERROR; + goto err; + } + unlock(&sensor_lock); + return; +err: + *(attr->sensor_data) = INVALID_DATA; + queue_msg_for_delivery(rc, attr); + unlock(&sensor_lock); + log_simple_error(&e_info(OPAL_RC_SENSOR_ASYNC_COMPLETE), + "SENSOR: %s: Failed to queue the " + "read request to fsp\n", __func__); +} + +static int64_t fsp_sensor_send_read_request(struct opal_sensor_data *attr) +{ + int rc; + struct fsp_msg *msg; + uint32_t *sensor_buf_ptr; + uint32_t align; + uint32_t cmd_header; + + DBG("Get the data for modifier [%d]\n", spcn_mod_data[attr->mod_index].mod); + if (spcn_mod_data[attr->mod_index].mod == SPCN_MOD_PROC_JUNC_TEMP) { + /* TODO Support this modifier '0x14', if required */ + align = attr->offset % sizeof(*sensor_buf_ptr); + if (align) + attr->offset += (sizeof(*sensor_buf_ptr) - align); + + sensor_buf_ptr = (uint32_t *)((uint8_t *)sensor_buffer + + attr->offset); + + /* TODO Add 8 byte command data required for mod 0x14 */ + + attr->offset += 8; + + cmd_header = spcn_mod_data[attr->mod_index].mod << 24 | + SPCN_CMD_PRS << 16 | 0x0008; + } else { + cmd_header = spcn_mod_data[attr->mod_index].mod << 24 | + SPCN_CMD_PRS << 16; + } + + msg = fsp_mkmsg(FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0, + PSI_DMA_SENSOR_BUF + attr->offset); + + if (!msg) { + prerror(SENSOR_PREFIX "%s: Failed to allocate read message" + "\n", __func__); + return OPAL_INTERNAL_ERROR; + } + + msg->user_data = attr; + rc = fsp_queue_msg(msg, fsp_sensor_read_complete); + if (rc) { + fsp_freemsg(msg); + msg = NULL; + prerror(SENSOR_PREFIX "%s: Failed to queue read message, " + "%d\n", __func__, rc); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_ASYNC_COMPLETION; +} + +static int64_t parse_sensor_id(uint32_t id, struct opal_sensor_data *attr) +{ + uint32_t mod, index; + + attr->spcn_attr = id >> 24; + if (attr->spcn_attr >= SENSOR_MAX) + return OPAL_PARAMETER; + + if (attr->spcn_attr <= SENSOR_ON_SUPPORTED) + mod = SPCN_MOD_PRS_STATUS_FIRST; + else if (attr->spcn_attr <= SENSOR_LOCATION) + mod = SPCN_MOD_SENSOR_PARAM_FIRST; + else if (attr->spcn_attr <= SENSOR_DATA) + mod = SPCN_MOD_SENSOR_DATA_FIRST; + else if (attr->spcn_attr <= SENSOR_POWER) + mod = SPCN_MOD_SENSOR_POWER; + else + return OPAL_PARAMETER; + + for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST; index++) { + if (spcn_mod_data[index].mod == mod) + break; + } + + attr->mod_index = index; + attr->frc = (id >> 16) & 0xff; + attr->rid = id & 0xffff; + + return 0; +} + + +static int64_t fsp_opal_read_sensor(uint32_t sensor_hndl, int token, + uint32_t *sensor_data) +{ + struct opal_sensor_data *attr; + int64_t rc; + + DBG("fsp_opal_read_sensor [%08x]\n", sensor_hndl); + if (sensor_state == SENSOR_PERMANENT_ERROR) { + rc = OPAL_HARDWARE; + goto out; + } + + if (!sensor_hndl) { + rc = OPAL_PARAMETER; + goto out; + } + + lock(&sensor_lock); + if (prev_msg_consumed) { + attr = zalloc(sizeof(*attr)); + 
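+ /*
+  * The sensor handle passed in by the OS packs the requested
+  * attribute, frame resource class and resource id, as built by
+  * create_sensor_nodes() when the device tree was populated:
+  *
+  *	bits 31..24	spcn_attr (e.g. SENSOR_DATA)
+  *	bits 23..16	frc	  (e.g. SENSOR_FRC_COOLING_FAN)
+  *	bits 15..0	rid
+  *
+  * parse_sensor_id() below undoes this packing.
+  */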
if (!attr) { + log_simple_error(&e_info(OPAL_RC_SENSOR_INIT), + "SENSOR: Failed to allocate memory\n"); + rc = OPAL_NO_MEM; + goto out_lock; + } + + /* Parse the sensor id and store them to the local structure */ + rc = parse_sensor_id(sensor_hndl, attr); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failed to parse the sensor " + "handle[0x%08x]\n", __func__, sensor_hndl); + goto out_free; + } + /* Kernel buffer pointer to copy the data later when ready */ + attr->sensor_data = sensor_data; + attr->async_token = token; + + rc = fsp_sensor_send_read_request(attr); + if (rc != OPAL_ASYNC_COMPLETION) { + log_simple_error(&e_info(OPAL_RC_SENSOR_READ), + "SENSOR: %s: Failed to queue the read " + "request to fsp\n", __func__); + goto out_free; + } + + prev_msg_consumed = false; + } else { + rc = OPAL_BUSY_EVENT; + } + + unlock(&sensor_lock); + return rc; + +out_free: + free(attr); +out_lock: + unlock(&sensor_lock); +out: + return rc; +} + + +#define MAX_RIDS 64 +#define MAX_NAME 64 + +static uint32_t get_index(uint32_t *prids, uint16_t rid) +{ + int index; + + for (index = 0; prids[index] && index < MAX_RIDS; index++) { + if (prids[index] == rid) + return index; + } + + prids[index] = rid; + return index; +} + +static void create_sensor_nodes(int index, uint16_t frc, uint16_t rid, + uint32_t *prids, struct dt_node *sensors) +{ + char name[MAX_NAME]; + struct dt_node *fs_node; + uint32_t value; + + switch (spcn_mod_data[index].mod) { + case SPCN_MOD_PRS_STATUS_FIRST: + case SPCN_MOD_PRS_STATUS_SUBS: + switch (frc) { + case SENSOR_FRC_POWER_SUPPLY: + case SENSOR_FRC_COOLING_FAN: + snprintf(name, MAX_NAME, "%s#%d-%s", frc_names[frc], + /* Start enumeration from 1 */ + get_index(prids, rid) + 1, + spcn_mod_data[index].mod_attr[1].name); + fs_node = dt_new(sensors, name); + snprintf(name, MAX_NAME, "ibm,opal-sensor-%s", + frc_names[frc]); + dt_add_property_string(fs_node, "compatible", name); + value = spcn_mod_data[index].mod_attr[1].val << 24 | + (frc & 0xff) << 16 | rid; + dt_add_property_cells(fs_node, "sensor-id", value); + break; + default: + break; + } + break; + case SPCN_MOD_SENSOR_PARAM_FIRST: + case SPCN_MOD_SENSOR_PARAM_SUBS: + case SPCN_MOD_SENSOR_DATA_FIRST: + case SPCN_MOD_SENSOR_DATA_SUBS: + switch (frc) { + case SENSOR_FRC_POWER_SUPPLY: + case SENSOR_FRC_COOLING_FAN: + case SENSOR_FRC_AMB_TEMP: + snprintf(name, MAX_NAME, "%s#%d-%s", frc_names[frc], + /* Start enumeration from 1 */ + get_index(prids, rid) + 1, + spcn_mod_data[index].mod_attr[0].name); + fs_node = dt_new(sensors, name); + snprintf(name, MAX_NAME, "ibm,opal-sensor-%s", + frc_names[frc]); + dt_add_property_string(fs_node, "compatible", name); + value = spcn_mod_data[index].mod_attr[0].val << 24 | + (frc & 0xff) << 16 | rid; + dt_add_property_cells(fs_node, "sensor-id", value); + break; + default: + break; + } + break; + + case SPCN_MOD_SENSOR_POWER: + fs_node = dt_new(sensors, "power#1-data"); + dt_add_property_string(fs_node, "compatible", "ibm,opal-sensor-power"); + value = spcn_mod_data[index].mod_attr[0].val << 24; + dt_add_property_cells(fs_node, "sensor-id", value); + break; + } +} + +static void add_sensor_ids(struct dt_node *sensors) +{ + uint32_t MAX_FRC_NAMES = sizeof(frc_names) / sizeof(*frc_names); + uint8_t *sensor_buf_ptr = (uint8_t *)sensor_buffer; + uint32_t frc_rids[MAX_FRC_NAMES][MAX_RIDS]; + uint16_t sensor_frc, power_rid; + uint16_t sensor_mod_data[8]; + int index, count; + + memset(frc_rids, 0, sizeof(frc_rids)); + + for (index = 0; spcn_mod_data[index].mod != 
SPCN_MOD_LAST; index++) { + if (spcn_mod_data[index].mod == SPCN_MOD_SENSOR_POWER) { + create_sensor_nodes(index, 0, 0, 0, sensors); + continue; + } + for (count = 0; count < spcn_mod_data[index].entry_count; + count++) { + if (spcn_mod_data[index].mod == + SPCN_MOD_PROC_JUNC_TEMP) { + /* TODO Support this modifier '0x14', if + * required */ + } else { + memcpy((void *)sensor_mod_data, sensor_buf_ptr, + spcn_mod_data[index].entry_size); + sensor_frc = sensor_mod_data[0]; + power_rid = sensor_mod_data[1]; + + if (sensor_frc < MAX_FRC_NAMES && + frc_names[sensor_frc]) + create_sensor_nodes(index, sensor_frc, + power_rid, + frc_rids[sensor_frc], + sensors); + } + + sensor_buf_ptr += spcn_mod_data[index].entry_size; + } + } +} + +static void add_opal_sensor_node(void) +{ + int index; + struct dt_node *sensors; + + if (!fsp_present()) + return; + + sensors = dt_new(opal_node, "sensors"); + + add_sensor_ids(sensors); + + /* Reset the entry count of each modifier */ + for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST; + index++) + spcn_mod_data[index].entry_count = 0; +} + +void fsp_init_sensor(void) +{ + uint32_t cmd_header, align, size, psi_dma_offset = 0; + enum spcn_rsp_status status; + uint32_t *sensor_buf_ptr; + struct fsp_msg msg, resp; + int index, rc; + + if (!fsp_present()) { + sensor_state = SENSOR_PERMANENT_ERROR; + return; + } + + sensor_buffer = memalign(TCE_PSIZE, SENSOR_MAX_SIZE); + if (!sensor_buffer) { + prerror("FSP: could not allocate sensor_buffer!\n"); + return; + } + + /* Map TCE */ + fsp_tce_map(PSI_DMA_SENSOR_BUF, sensor_buffer, PSI_DMA_SENSOR_BUF_SZ); + + /* Register OPAL interface */ + opal_register(OPAL_SENSOR_READ, fsp_opal_read_sensor, 3); + + msg.resp = &resp; + + /* Traverse using all the modifiers to know all the sensors available + * in the system */ + for (index = 0; spcn_mod_data[index].mod != SPCN_MOD_LAST && + sensor_state == SENSOR_VALID_DATA;) { + DBG("Get the data for modifier [%d]\n", spcn_mod_data[index].mod); + if (spcn_mod_data[index].mod == SPCN_MOD_PROC_JUNC_TEMP) { + /* TODO Support this modifier 0x14, if required */ + align = psi_dma_offset % sizeof(*sensor_buf_ptr); + if (align) + psi_dma_offset += (sizeof(*sensor_buf_ptr) - align); + + sensor_buf_ptr = (uint32_t *)((uint8_t *)sensor_buffer + + psi_dma_offset); + + /* TODO Add 8 byte command data required for mod 0x14 */ + psi_dma_offset += 8; + + cmd_header = spcn_mod_data[index].mod << 24 | + SPCN_CMD_PRS << 16 | 0x0008; + } else { + cmd_header = spcn_mod_data[index].mod << 24 | + SPCN_CMD_PRS << 16; + } + + fsp_fillmsg(&msg, FSP_CMD_SPCN_PASSTHRU, 4, + SPCN_ADDR_MODE_CEC_NODE, cmd_header, 0, + PSI_DMA_SENSOR_BUF + psi_dma_offset); + + rc = fsp_sync_msg(&msg, false); + if (rc >= 0) { + status = (resp.data.words[1] >> 24) & 0xff; + size = fsp_sensor_process_read(&resp); + psi_dma_offset += size; + spcn_mod_data[index].entry_count += (size / + spcn_mod_data[index].entry_size); + } else { + sensor_state = SENSOR_PERMANENT_ERROR; + break; + } + + switch (spcn_mod_data[index].mod) { + case SPCN_MOD_PRS_STATUS_FIRST: + case SPCN_MOD_SENSOR_PARAM_FIRST: + case SPCN_MOD_SENSOR_DATA_FIRST: + if (status == SPCN_RSP_STATUS_COND_SUCCESS) + index++; + else + index += 2; + + break; + case SPCN_MOD_PRS_STATUS_SUBS: + case SPCN_MOD_SENSOR_PARAM_SUBS: + case SPCN_MOD_SENSOR_DATA_SUBS: + if (status != SPCN_RSP_STATUS_COND_SUCCESS) + index++; + break; + case SPCN_MOD_SENSOR_POWER: + index++; + default: + break; + } + } + + if (sensor_state != SENSOR_VALID_DATA) + sensor_state = SENSOR_PERMANENT_ERROR; + 
else + add_opal_sensor_node(); +} diff --git a/hw/fsp/fsp-surveillance.c b/hw/fsp/fsp-surveillance.c new file mode 100644 index 00000000..c1d19b64 --- /dev/null +++ b/hw/fsp/fsp-surveillance.c @@ -0,0 +1,209 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include + +static bool fsp_surv_state = false; +static bool fsp_surv_ack_pending = false; +static u64 surv_timer; +static u64 surv_ack_timer; +static u32 surv_state_param; +static struct lock surv_lock = LOCK_UNLOCKED; + +#define FSP_SURV_ACK_TIMEOUT 120 /* surv ack timeout in seconds */ + +DEFINE_LOG_ENTRY(OPAL_RC_SURVE_INIT, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE, + OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_MISCELLANEOUS_INFO_ONLY, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_SURVE_STATUS, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE, + OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_MISCELLANEOUS_INFO_ONLY, NULL); + +static void fsp_surv_ack(struct fsp_msg *msg) +{ + uint8_t val; + + if (!msg->resp) + return; + + val = (msg->resp->word1 >> 8) & 0xff; + if (val == 0) { + /* reset the pending flag */ + printf("SURV: Received heartbeat acknowledge from FSP\n"); + lock(&surv_lock); + fsp_surv_ack_pending = false; + unlock(&surv_lock); + } else + prerror("SURV: Heartbeat Acknowledgment error from FSP\n"); + + fsp_freemsg(msg); +} + +static void fsp_surv_check_timeout(void) +{ + u64 now = mftb(); + + /* + * We just checked fsp_surv_ack_pending to be true in fsp_surv_hbeat + * and we haven't dropped the surv_lock between then and now. So, we + * just go ahead and check timeouts. + */ + if (tb_compare(now, surv_ack_timer) == TB_AAFTERB) { + /* XXX: We should be logging a PEL to the host, assuming + * the FSP is dead, pending a R/R. + */ + prerror("SURV: [%16llx] Surv ACK timed out; initiating R/R\n", + now); + + /* Reset the pending trigger too */ + fsp_surv_ack_pending = false; + fsp_trigger_reset(); + } + + return; +} + +/* Send surveillance heartbeat based on a timebase trigger */ +static void fsp_surv_hbeat(void) +{ + u64 now = mftb(); + + /* Check if an ack is pending... if so, don't send the ping just yet */ + if (fsp_surv_ack_pending) { + fsp_surv_check_timeout(); + return; + } + + /* add timebase callbacks */ + /* + * XXX This packet needs to be pushed to FSP in an interval + * less than 120s that's advertised to FSP. + * + * Verify if the command building format and call is fine. 
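+ *
+ * Timing sketch of what the code below implements:
+ *
+ *	T		heartbeat sent, 120s interval advertised to FSP
+ *	T + 60s		next heartbeat due (surv_timer)
+ *	T + 120s	ack deadline (surv_ack_timer); if no ack has
+ *			arrived by then, fsp_surv_check_timeout() calls
+ *			fsp_trigger_reset().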
+ */ + if (surv_timer == 0 || + (tb_compare(now, surv_timer) == TB_AAFTERB) || + (tb_compare(now, surv_timer) == TB_AEQUALB)) { + printf("SURV: [%16llx] Sending the hearbeat command to FSP\n", + now); + fsp_queue_msg(fsp_mkmsg(FSP_CMD_SURV_HBEAT, 1, 120), + fsp_surv_ack); + + fsp_surv_ack_pending = true; + surv_timer = now + secs_to_tb(60); + surv_ack_timer = now + secs_to_tb(FSP_SURV_ACK_TIMEOUT); + } +} + +static void fsp_surv_poll(void *data __unused) +{ + if (!fsp_surv_state) + return; + lock(&surv_lock); + fsp_surv_hbeat(); + unlock(&surv_lock); +} + +static void fsp_surv_got_param(uint32_t param_id __unused, int err_len, + void *data __unused) +{ + if (err_len != 4) { + log_simple_error(&e_info(OPAL_RC_SURVE_STATUS), + "SURV: Error retreiving surveillance status: %d\n", + err_len); + return; + } + + printf("SURV: Status from FSP: %d\n", surv_state_param); + if (!(surv_state_param & 0x01)) + return; + + lock(&surv_lock); + fsp_surv_state = true; + + /* Also send one heartbeat now. The next one will not happen + * until we hit the OS. + */ + fsp_surv_hbeat(); + unlock(&surv_lock); +} + +void fsp_surv_query(void) +{ + int rc; + + printf("SURV: Querying FSP's surveillance status\n"); + + /* Reset surveillance settings */ + lock(&surv_lock); + fsp_surv_state = false; + surv_timer = 0; + surv_ack_timer = 0; + unlock(&surv_lock); + + /* Query FPS for surveillance state */ + rc = fsp_get_sys_param(SYS_PARAM_SURV, &surv_state_param, 4, + fsp_surv_got_param, NULL); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SURVE_INIT), + "SURV: Error %d queueing param request\n", rc); + } +} + +static bool fsp_surv_msg_rr(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + assert(msg == NULL); + + switch (cmd_sub_mod) { + case FSP_RESET_START: + printf("SURV: Disabling surveillance\n"); + fsp_surv_state = false; + fsp_surv_ack_pending = false; + return true; + case FSP_RELOAD_COMPLETE: + fsp_surv_query(); + return true; + } + return false; +} + +static struct fsp_client fsp_surv_client_rr = { + .message = fsp_surv_msg_rr, +}; + +/* This is called at boot time */ +void fsp_init_surveillance(void) +{ + /* Always register the poller, so we don't have to add/remove + * it on reset-reload or change of surveillance state. Also the + * poller list has no locking so we don't want to play with it + * at runtime. + */ + opal_add_poller(fsp_surv_poll, NULL); + + /* Register for the reset/reload event */ + fsp_register_client(&fsp_surv_client_rr, FSP_MCLASS_RR_EVENT); + + /* Send query to FSP */ + fsp_surv_query(); +} + diff --git a/hw/fsp/fsp-sysparam.c b/hw/fsp/fsp-sysparam.c new file mode 100644 index 00000000..e9e5b164 --- /dev/null +++ b/hw/fsp/fsp-sysparam.c @@ -0,0 +1,454 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sysparam_comp_data { + uint32_t param_len; + uint64_t async_token; +}; + +struct sysparam_req { + sysparam_compl_t completion; + void *comp_data; + void *ubuf; + uint32_t ulen; + struct fsp_msg msg; + struct fsp_msg resp; + bool done; +}; + +static struct sysparam_attr { + const char *name; + uint32_t id; + uint32_t length; + uint8_t perm; +} sysparam_attrs[] = { +#define _R OPAL_SYSPARAM_READ +#define _W OPAL_SYSPARAM_WRITE +#define _RW OPAL_SYSPARAM_RW + {"surveillance", SYS_PARAM_SURV, 4, _RW}, + {"hmc-management", SYS_PARAM_HMC_MANAGED, 4, _R}, + {"cupd-policy", SYS_PARAM_FLASH_POLICY, 4, _RW}, + {"plat-hmc-managed", SYS_PARAM_NEED_HMC, 4, _RW}, + {"fw-license-policy", SYS_PARAM_FW_LICENSE, 4, _RW}, + {"world-wide-port-num", SYS_PARAM_WWPN, 12, _W}, + {"default-boot-device", SYS_PARAM_DEF_BOOT_DEV, 1, _RW}, + {"next-boot-device", SYS_PARAM_NEXT_BOOT_DEV,1, _RW} +#undef _R +#undef _W +#undef _RW +}; + +static int fsp_sysparam_process(struct sysparam_req *r) +{ + u32 param_id, len; + int stlen = 0; + u8 fstat; + /* Snapshot completion before we set the "done" flag */ + sysparam_compl_t comp = r->completion; + void *cdata = r->comp_data; + + if (r->msg.state != fsp_msg_done) { + prerror("FSP: Request for sysparam 0x%x got FSP failure!\n", + r->msg.data.words[0]); + stlen = -1; /* XXX Find saner error codes */ + goto complete; + } + + param_id = r->resp.data.words[0]; + len = r->resp.data.words[1] & 0xffff; + + /* Check params validity */ + if (param_id != r->msg.data.words[0]) { + prerror("FSP: Request for sysparam 0x%x got resp. for 0x%x!\n", + r->msg.data.words[0], param_id); + stlen = -2; /* XXX Sane error codes */ + goto complete; + } + if (len > r->ulen) { + prerror("FSP: Request for sysparam 0x%x truncated!\n", + param_id); + len = r->ulen; + } + + /* Decode the request status */ + fstat = (r->msg.resp->word1 >> 8) & 0xff; + switch(fstat) { + case 0x00: /* XXX Is that even possible ? 
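+ * (Status 0x11 below means the parameter data came back inline in the
+ * response message and is copied out here; 0x12 means the FSP wrote it
+ * through the TCE mapping set up in fsp_get_sys_param(), so only the
+ * length is passed back.)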
*/ + case 0x11: /* Data in request */ + memcpy(r->ubuf, &r->resp.data.words[2], len); + /* pass through */ + case 0x12: /* Data in TCE */ + stlen = len; + break; + default: + stlen = -fstat; + } + complete: + /* Call completion if any */ + if (comp) + comp(r->msg.data.words[0], stlen, cdata); + + free(r); + + return stlen; +} + +static void fsp_sysparam_get_complete(struct fsp_msg *msg) +{ + struct sysparam_req *r = container_of(msg, struct sysparam_req, msg); + + /* If it's an asynchronous request, process it now */ + if (r->completion) { + fsp_sysparam_process(r); + return; + } + + /* Else just set the done flag */ + + /* Another CPU can be polling on the "done" flag without the + * lock held, so let's order the udpates to the structure + */ + lwsync(); + r->done = true; +} + +int fsp_get_sys_param(uint32_t param_id, void *buffer, uint32_t length, + sysparam_compl_t async_complete, void *comp_data) +{ + struct sysparam_req *r; + uint64_t baddr, tce_token; + int rc; + + if (!fsp_present()) + return -ENODEV; + /* + * XXX FIXME: We currently always allocate the sysparam_req here + * however, we want to avoid runtime allocations as much as + * possible, so if this is going to be used a lot at runtime, + * we probably want to pre-allocate a pool of these + */ + r = zalloc(sizeof(struct sysparam_req)); + if (!r) + return -ENOMEM; + if (length > 4096) + return -EINVAL; + r->completion = async_complete; + r->comp_data = comp_data; + r->done = false; + r->ubuf = buffer; + r->ulen = length; + r->msg.resp = &r->resp; + + /* Map always 1 page ... easier that way and none of that + * is performance critical + */ + baddr = (uint64_t)buffer; + fsp_tce_map(PSI_DMA_GET_SYSPARAM, (void *)(baddr & ~0xffful), 0x1000); + tce_token = PSI_DMA_GET_SYSPARAM | (baddr & 0xfff); + fsp_fillmsg(&r->msg, FSP_CMD_QUERY_SPARM, 3, + param_id, length, tce_token); + rc = fsp_queue_msg(&r->msg, fsp_sysparam_get_complete); + + /* Asynchronous operation or queueing failure, return */ + if (rc || async_complete) + return rc; + + /* Synchronous operation requested, spin and process */ + while(!r->done) + fsp_poll(); + + /* Will free the request */ + return fsp_sysparam_process(r); +} + +static void fsp_opal_getparam_complete(uint32_t param_id __unused, int err_len, + void *data) +{ + struct sysparam_comp_data *comp_data = data; + int rc = OPAL_SUCCESS; + + if (comp_data->param_len != err_len) + rc = OPAL_INTERNAL_ERROR; + + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + comp_data->async_token, rc); + free(comp_data); +} + +static void fsp_opal_setparam_complete(struct fsp_msg *msg) +{ + struct sysparam_comp_data *comp_data = msg->user_data; + u8 fstat; + uint32_t param_id; + int rc = OPAL_SUCCESS; + + if (msg->state != fsp_msg_done) { + prerror("FSP: Request for set sysparam 0x%x got FSP failure!\n", + msg->data.words[0]); + rc = OPAL_INTERNAL_ERROR; + goto out; + } + + param_id = msg->resp->data.words[0]; + if (param_id != msg->data.words[0]) { + prerror("FSP: Request for set sysparam 0x%x got resp. for 0x%x!" 
+ "\n", msg->data.words[0], param_id); + rc = OPAL_INTERNAL_ERROR; + goto out; + } + + fstat = (msg->resp->word1 >> 8) & 0xff; + switch (fstat) { + case 0x00: + rc = OPAL_SUCCESS; + break; + case 0x22: + prerror("%s: Response status 0x%x, invalid data\n", __func__, + fstat); + rc = OPAL_INTERNAL_ERROR; + break; + case 0x24: + prerror("%s: Response status 0x%x, DMA error\n", __func__, + fstat); + rc = OPAL_INTERNAL_ERROR; + break; + default: + rc = OPAL_INTERNAL_ERROR; + break; + } + +out: + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, + comp_data->async_token, rc); + free(comp_data); + fsp_freemsg(msg); +} + +/* OPAL interface for PowerNV to read the system parameter from FSP */ +static int64_t fsp_opal_get_param(uint64_t async_token, uint32_t param_id, + uint64_t buffer, uint64_t length) +{ + struct sysparam_comp_data *comp_data; + int count, rc, i; + + if (!fsp_present()) + return OPAL_HARDWARE; + + count = ARRAY_SIZE(sysparam_attrs); + for (i = 0; i < count; i++) + if (sysparam_attrs[i].id == param_id) + break; + if (i == count) + return OPAL_PARAMETER; + + if (length < sysparam_attrs[i].length) + return OPAL_PARAMETER; + if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_READ)) + return OPAL_PERMISSION; + + comp_data = zalloc(sizeof(struct sysparam_comp_data)); + if (!comp_data) + return OPAL_NO_MEM; + + comp_data->param_len = sysparam_attrs[i].length; + comp_data->async_token = async_token; + rc = fsp_get_sys_param(param_id, (void *)buffer, + sysparam_attrs[i].length, fsp_opal_getparam_complete, + comp_data); + if (rc) { + free(comp_data); + prerror("%s: Error %d queuing param request\n", __func__, rc); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_ASYNC_COMPLETION; +} + +/* OPAL interface for PowerNV to update the system parameter to FSP */ +static int64_t fsp_opal_set_param(uint64_t async_token, uint32_t param_id, + uint64_t buffer, uint64_t length) +{ + struct sysparam_comp_data *comp_data; + struct fsp_msg *msg; + uint64_t tce_token; + int count, rc, i; + + if (!fsp_present()) + return OPAL_HARDWARE; + + count = ARRAY_SIZE(sysparam_attrs); + for (i = 0; i < count; i++) + if (sysparam_attrs[i].id == param_id) + break; + if (i == count) + return OPAL_PARAMETER; + + if (length < sysparam_attrs[i].length) + return OPAL_PARAMETER; + if (!(sysparam_attrs[i].perm & OPAL_SYSPARAM_WRITE)) + return OPAL_PERMISSION; + + fsp_tce_map(PSI_DMA_SET_SYSPARAM, (void *)(buffer & ~0xffful), 0x1000); + tce_token = PSI_DMA_SET_SYSPARAM | (buffer & 0xfff); + + msg = fsp_mkmsg(FSP_CMD_SET_SPARM_2, 4, param_id, length, + tce_token >> 32, tce_token); + if (!msg) { + prerror("%s: Failed to allocate the message\n", __func__); + return OPAL_INTERNAL_ERROR; + } + + comp_data = zalloc(sizeof(struct sysparam_comp_data)); + if (!comp_data) + return OPAL_NO_MEM; + + comp_data->param_len = length; + comp_data->async_token = async_token; + msg->user_data = comp_data; + + rc = fsp_queue_msg(msg, fsp_opal_setparam_complete); + if (rc) { + free(comp_data); + fsp_freemsg(msg); + prerror("%s: Failed to queue the message\n", __func__); + return OPAL_INTERNAL_ERROR; + } + + return OPAL_ASYNC_COMPLETION; +} + +static bool fsp_sysparam_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + struct fsp_msg *rsp; + int rc = -ENOMEM; + + switch(cmd_sub_mod) { + case FSP_CMD_SP_SPARM_UPD_0: + case FSP_CMD_SP_SPARM_UPD_1: + printf("FSP: Got sysparam update, param ID 0x%x\n", + msg->data.words[0]); + rsp = fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008000, 0); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + prerror("FSP: 
Error %d queuing sysparam reply\n", rc); + /* What to do here ? R/R ? */ + fsp_freemsg(rsp); + } + return true; + } + return false; +} + +static struct fsp_client fsp_sysparam_client = { + .message = fsp_sysparam_msg, +}; + +static void add_opal_sysparam_node(void) +{ + struct dt_node *sysparams; + char *names, *s; + uint32_t *ids, *lens; + uint8_t *perms; + unsigned int i, count, size = 0; + + if (!fsp_present()) + return; + + sysparams = dt_new(opal_node, "sysparams"); + dt_add_property_string(sysparams, "compatible", "ibm,opal-sysparams"); + + count = ARRAY_SIZE(sysparam_attrs); + for (i = 0; i < count; i++) + size = size + strlen(sysparam_attrs[i].name) + 1; + + names = zalloc(size); + if (!names) { + prerror("%s: Failed to allocate memory for parameter names\n", + __func__); + return; + } + + ids = zalloc(count * sizeof(*ids)); + if (!ids) { + prerror("%s: Failed to allocate memory for parameter ids\n", + __func__); + goto out_free_name; + } + + lens = zalloc(count * sizeof(*lens)); + if (!lens) { + prerror("%s: Failed to allocate memory for parameter length\n", + __func__); + goto out_free_id; + } + + perms = zalloc(count * sizeof(*perms)); + if (!perms) { + prerror("%s: Failed to allocate memory for parameter length\n", + __func__); + goto out_free_len; + } + + s = names; + for (i = 0; i < count; i++) { + strcpy(s, sysparam_attrs[i].name); + s = s + strlen(sysparam_attrs[i].name) + 1; + + ids[i] = sysparam_attrs[i].id; + lens[i] = sysparam_attrs[i].length; + perms[i] = sysparam_attrs[i].perm; + } + + dt_add_property(sysparams, "param-name", names, size); + dt_add_property(sysparams, "param-id", ids, count * sizeof(*ids)); + dt_add_property(sysparams, "param-len", lens, count * sizeof(*lens)); + dt_add_property(sysparams, "param-perm", perms, count * sizeof(*perms)); + + free(perms); + +out_free_len: + free(lens); +out_free_id: + free(ids); +out_free_name: + free(names); +} + +void fsp_sysparam_init(void) +{ + if (!fsp_present()) + return; + + /* Register change notifications */ + fsp_register_client(&fsp_sysparam_client, FSP_MCLASS_SERVICE); + + /* Register OPAL interfaces */ + opal_register(OPAL_GET_PARAM, fsp_opal_get_param, 4); + opal_register(OPAL_SET_PARAM, fsp_opal_set_param, 4); + + /* Add device-tree nodes */ + add_opal_sysparam_node(); +} diff --git a/hw/fsp/fsp.c b/hw/fsp/fsp.c new file mode 100644 index 00000000..5dc298aa --- /dev/null +++ b/hw/fsp/fsp.c @@ -0,0 +1,2147 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Service Processor handling code + * + * XXX This mixes PSI and FSP and currently only supports + * P7/P7+ PSI and FSP1 + * + * If we are going to support P8 PSI and FSP2, we probably want + * to split the PSI support from the FSP support proper first. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_LOG_ENTRY(OPAL_RC_FSP_POLL_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_FSP, + OPAL_PLATFORM_FIRMWARE, OPAL_ERROR_PANIC, OPAL_NA, NULL); + +//#define DBG(fmt...) printf(fmt) +#define DBG(fmt...) do { } while(0) +#define FSP_TRACE_MSG +#define FSP_TRACE_EVENT + +#define FSP_MAX_IOPATH 4 + +enum fsp_path_state { + fsp_path_bad, + fsp_path_backup, + fsp_path_active, +}; + +struct fsp_iopath { + enum fsp_path_state state; + void *fsp_regs; + struct psi *psi; +}; + +enum fsp_mbx_state { + fsp_mbx_idle, /* Mailbox ready to send */ + fsp_mbx_send, /* Mailbox sent, waiting for ack */ + fsp_mbx_crit_op, /* Critical operation in progress */ + fsp_mbx_prep_for_reset, /* Prepare for reset sent */ + fsp_mbx_err, /* Mailbox in error state, waiting for r&r */ + fsp_mbx_rr, /* Mailbox in r&r */ +}; + +struct fsp { + struct fsp *link; + unsigned int index; + enum fsp_mbx_state state; + struct fsp_msg *pending; + + unsigned int iopath_count; + int active_iopath; /* -1: no active IO path */ + struct fsp_iopath iopath[FSP_MAX_IOPATH]; +}; + +static struct fsp *first_fsp; +static struct fsp *active_fsp; +static u16 fsp_curseq = 0x8000; +static u64 *fsp_tce_table; + +#define FSP_INBOUND_SIZE 0x00100000UL +static void *fsp_inbound_buf = NULL; +static u32 fsp_inbound_off; + +static struct lock fsp_lock = LOCK_UNLOCKED; + +static u64 fsp_cmdclass_resp_bitmask; +static u64 timeout_timer; + +static u64 fsp_hir_timeout; + +#define FSP_CRITICAL_OP_TIMEOUT 128 +#define FSP_DRCR_CLEAR_TIMEOUT 128 + +/* + * We keep track on last logged values for some things to print only on + * value changes, but also to releive pressure on the tracer which + * doesn't do a very good job at detecting repeats when called from + * many different CPUs + */ +static u32 disr_last_print; +static u32 drcr_last_print; +static u32 hstate_last_print; + +void fsp_handle_resp(struct fsp_msg *msg); + +struct fsp_cmdclass { + int timeout; + bool busy; + struct list_head msgq; + struct list_head clientq; + struct list_head rr_queue; /* To queue up msgs during R/R */ + u64 timesent; +}; + +static struct fsp_cmdclass fsp_cmdclass_rr; + +static struct fsp_cmdclass fsp_cmdclass[FSP_MCLASS_LAST - FSP_MCLASS_FIRST + 1] += { +#define DEF_CLASS(_cl, _to) [_cl - FSP_MCLASS_FIRST] = { .timeout = _to } + DEF_CLASS(FSP_MCLASS_SERVICE, 16), + DEF_CLASS(FSP_MCLASS_PCTRL_MSG, 16), + DEF_CLASS(FSP_MCLASS_PCTRL_ABORTS, 16), + DEF_CLASS(FSP_MCLASS_ERR_LOG, 16), + DEF_CLASS(FSP_MCLASS_CODE_UPDATE, 40), + DEF_CLASS(FSP_MCLASS_FETCH_SPDATA, 16), + DEF_CLASS(FSP_MCLASS_FETCH_HVDATA, 16), + DEF_CLASS(FSP_MCLASS_NVRAM, 16), + DEF_CLASS(FSP_MCLASS_MBOX_SURV, 2), + DEF_CLASS(FSP_MCLASS_RTC, 16), + DEF_CLASS(FSP_MCLASS_SMART_CHIP, 20), + DEF_CLASS(FSP_MCLASS_INDICATOR, 180), + DEF_CLASS(FSP_MCLASS_HMC_INTFMSG, 16), + DEF_CLASS(FSP_MCLASS_HMC_VT, 16), + DEF_CLASS(FSP_MCLASS_HMC_BUFFERS, 16), + DEF_CLASS(FSP_MCLASS_SHARK, 16), + DEF_CLASS(FSP_MCLASS_MEMORY_ERR, 16), + DEF_CLASS(FSP_MCLASS_CUOD_EVENT, 16), + DEF_CLASS(FSP_MCLASS_HW_MAINT, 16), + DEF_CLASS(FSP_MCLASS_VIO, 16), + DEF_CLASS(FSP_MCLASS_SRC_MSG, 16), + DEF_CLASS(FSP_MCLASS_DATA_COPY, 16), + DEF_CLASS(FSP_MCLASS_TONE, 16), + DEF_CLASS(FSP_MCLASS_VIRTUAL_NVRAM, 16), + DEF_CLASS(FSP_MCLASS_TORRENT, 16), + DEF_CLASS(FSP_MCLASS_NODE_PDOWN, 16), + DEF_CLASS(FSP_MCLASS_DIAG, 16), + DEF_CLASS(FSP_MCLASS_PCIE_LINK_TOPO, 16), + DEF_CLASS(FSP_MCLASS_OCC, 16), +}; + +static void fsp_trace_msg(struct 
fsp_msg *msg, u8 dir __unused) +{ + union trace fsp __unused; +#ifdef FSP_TRACE_MSG + size_t len = offsetof(struct trace_fsp_msg, data[msg->dlen]); + + fsp.fsp_msg.dlen = msg->dlen; + fsp.fsp_msg.word0 = msg->word0; + fsp.fsp_msg.word1 = msg->word1; + fsp.fsp_msg.dir = dir; + memcpy(fsp.fsp_msg.data, msg->data.bytes, msg->dlen); + trace_add(&fsp, TRACE_FSP_MSG, len); +#endif /* FSP_TRACE_MSG */ + assert(msg->dlen <= sizeof(fsp.fsp_msg.data)); +} + +static struct fsp *fsp_get_active(void) +{ + /* XXX Handle transition between FSPs */ + return active_fsp; +} + +static u64 fsp_get_class_bit(u8 class) +{ + /* Alias classes CE and CF as the FSP has a single queue */ + if (class == FSP_MCLASS_IPL) + class = FSP_MCLASS_SERVICE; + + return 1ul << (class - FSP_MCLASS_FIRST); +} + +static struct fsp_cmdclass *__fsp_get_cmdclass(u8 class) +{ + struct fsp_cmdclass *ret; + + /* RR class is special */ + if (class == FSP_MCLASS_RR_EVENT) + return &fsp_cmdclass_rr; + + /* Bound check */ + if (class < FSP_MCLASS_FIRST || class > FSP_MCLASS_LAST) + return NULL; + + /* Alias classes CE and CF as the FSP has a single queue */ + if (class == FSP_MCLASS_IPL) + class = FSP_MCLASS_SERVICE; + + ret = &fsp_cmdclass[class - FSP_MCLASS_FIRST]; + + /* Unknown class */ + if (ret->timeout == 0) + return NULL; + + return ret; +} + +static struct fsp_cmdclass *fsp_get_cmdclass(struct fsp_msg *msg) +{ + u8 c = msg->word0 & 0xff; + + return __fsp_get_cmdclass(c); +} + +static struct fsp_msg *__fsp_allocmsg(void) +{ + return zalloc(sizeof(struct fsp_msg)); +} + +struct fsp_msg *fsp_allocmsg(bool alloc_response) +{ + struct fsp_msg *msg; + + msg = __fsp_allocmsg(); + if (!msg) + return NULL; + if (alloc_response) + msg->resp = __fsp_allocmsg(); + return msg; +} + +void __fsp_freemsg(struct fsp_msg *msg) +{ + free(msg); +} + +void fsp_freemsg(struct fsp_msg *msg) +{ + if (msg->resp) + __fsp_freemsg(msg->resp); + __fsp_freemsg(msg); +} + +void fsp_cancelmsg(struct fsp_msg *msg) +{ + bool need_unlock = false; + struct fsp_cmdclass* cmdclass = fsp_get_cmdclass(msg); + struct fsp *fsp = fsp_get_active(); + + if (fsp->state != fsp_mbx_rr) { + prerror("FSP: Message cancel allowed only when" + "FSP is in reset\n"); + return; + } + + if (!cmdclass) + return; + + /* Recursive locking */ + need_unlock = lock_recursive(&fsp_lock); + + list_del(&msg->link); + msg->state = fsp_msg_cancelled; + + if (need_unlock) + unlock(&fsp_lock); +} + +static void fsp_wreg(struct fsp *fsp, u32 reg, u32 val) +{ + struct fsp_iopath *iop; + + if (fsp->active_iopath < 0) + return; + iop = &fsp->iopath[fsp->active_iopath]; + if (iop->state == fsp_path_bad) + return; + out_be32(iop->fsp_regs + reg, val); +} + +static u32 fsp_rreg(struct fsp *fsp, u32 reg) +{ + struct fsp_iopath *iop; + + if (fsp->active_iopath < 0) + return 0xffffffff; + iop = &fsp->iopath[fsp->active_iopath]; + if (iop->state == fsp_path_bad) + return 0xffffffff; + return in_be32(iop->fsp_regs + reg); +} + +static void fsp_reg_dump(void) +{ +#define FSP_DUMP_ONE(x) \ + printf(" %20s: %x\n", #x, fsp_rreg(fsp, x)); + + struct fsp *fsp = fsp_get_active(); + + if (!fsp) + return; + + printf("FSP #%d: Register dump (state=%d)\n", + fsp->index, fsp->state); + FSP_DUMP_ONE(FSP_DRCR_REG); + FSP_DUMP_ONE(FSP_DISR_REG); + FSP_DUMP_ONE(FSP_MBX1_HCTL_REG); + FSP_DUMP_ONE(FSP_MBX1_FCTL_REG); + FSP_DUMP_ONE(FSP_MBX2_HCTL_REG); + FSP_DUMP_ONE(FSP_MBX2_FCTL_REG); + FSP_DUMP_ONE(FSP_SDES_REG); + FSP_DUMP_ONE(FSP_HDES_REG); + FSP_DUMP_ONE(FSP_HDIR_REG); + FSP_DUMP_ONE(FSP_HDIM_SET_REG); + 
FSP_DUMP_ONE(FSP_PDIR_REG); + FSP_DUMP_ONE(FSP_PDIM_SET_REG); + FSP_DUMP_ONE(FSP_SCRATCH0_REG); + FSP_DUMP_ONE(FSP_SCRATCH1_REG); + FSP_DUMP_ONE(FSP_SCRATCH2_REG); + FSP_DUMP_ONE(FSP_SCRATCH3_REG); +} + +static void fsp_notify_rr_state(u32 state) +{ + struct fsp_client *client, *next; + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(FSP_MCLASS_RR_EVENT); + + assert(cmdclass); + list_for_each_safe(&cmdclass->clientq, client, next, link) + client->message(state, NULL); +} + +static void fsp_reset_cmdclass(void) +{ + int i; + struct fsp_msg *msg; + + for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i]; + cmdclass->busy = false; + cmdclass->timesent = 0; + + /* We also need to reset the 'timeout' timers here */ + + /* Make sure the message queue is empty */ + while(!list_empty(&cmdclass->msgq)) { + msg = list_pop(&cmdclass->msgq, struct fsp_msg, + link); + list_add_tail(&cmdclass->rr_queue, &msg->link); + } + } +} + +static bool fsp_in_hir(struct fsp *fsp) +{ + switch (fsp->state) { + case fsp_mbx_crit_op: + case fsp_mbx_prep_for_reset: + return true; + default: + return false; + } +} + +static bool fsp_in_reset(struct fsp *fsp) +{ + switch (fsp->state) { + case fsp_mbx_err: /* Will be reset soon */ + case fsp_mbx_rr: /* Already in reset */ + return true; + default: + return false; + } +} + +static bool fsp_hir_state_timeout(void) +{ + u64 now = mftb(); + + if (tb_compare(now, fsp_hir_timeout) == TB_AAFTERB) + return true; + + return false; +} + +static void fsp_set_hir_timeout(u32 seconds) +{ + u64 now = mftb(); + fsp_hir_timeout = now + secs_to_tb(seconds); +} + +static bool fsp_crit_op_in_progress(struct fsp *fsp) +{ + u32 disr = fsp_rreg(fsp, FSP_DISR_REG); + + if (disr & FSP_DISR_CRIT_OP_IN_PROGRESS) + return true; + + return false; +} + +/* Notify the FSP that it will be reset soon by writing to the DRCR */ +static void fsp_prep_for_reset(struct fsp *fsp) +{ + u32 drcr = fsp_rreg(fsp, FSP_DRCR_REG); + + printf("FSP: Writing reset to DRCR\n"); + drcr_last_print = drcr; + fsp_wreg(fsp, FSP_DRCR_REG, (drcr | FSP_PREP_FOR_RESET_CMD)); + fsp->state = fsp_mbx_prep_for_reset; + fsp_set_hir_timeout(FSP_DRCR_CLEAR_TIMEOUT); +} + +static void fsp_hir_poll(struct fsp *fsp, struct psi *psi) +{ + u32 drcr; + + switch (fsp->state) { + case fsp_mbx_crit_op: + if (fsp_crit_op_in_progress(fsp)) { + if (fsp_hir_state_timeout()) + prerror("FSP: Critical operation timeout\n"); + /* XXX What do do next? Check with FSP folks */ + } else { + fsp_prep_for_reset(fsp); + } + break; + case fsp_mbx_prep_for_reset: + drcr = fsp_rreg(fsp, FSP_DRCR_REG); + + if (drcr != drcr_last_print) { + printf("FSP: DRCR changed, old = %x, new = %x\n", + drcr_last_print, drcr); + drcr_last_print = drcr; + } + + if (drcr & FSP_DRCR_ACK_MASK) { + if (fsp_hir_state_timeout()) { + prerror("FSP: Ack timeout. Triggering reset\n"); + psi_disable_link(psi); + fsp->state = fsp_mbx_err; + } + } else { + printf("FSP: DRCR ack received. Triggering reset\n"); + psi_disable_link(psi); + fsp->state = fsp_mbx_err; + } + break; + default: + break; + } +} + +/* + * This is the main entry for the host initiated reset case. + * This gets called when: + * a. Surveillance ack is not received in 120 seconds + * b. A mailbox command doesn't get a response within the stipulated time. 
+ */ +static void __fsp_trigger_reset(void) +{ + struct fsp *fsp = fsp_get_active(); + u32 disr; + + /* Already in one of the error processing states */ + if (fsp_in_hir(fsp) || fsp_in_reset(fsp)) + return; + + prerror("FSP: fsp_trigger_reset() entry\n"); + + drcr_last_print = 0; + /* + * Check if we are allowed to reset the FSP. We aren't allowed to + * reset the FSP if the FSP_DISR_DBG_IN_PROGRESS is set. + */ + disr = fsp_rreg(fsp, FSP_DISR_REG); + if (disr & FSP_DISR_DBG_IN_PROGRESS) { + prerror("FSP: Host initiated reset disabled\n"); + return; + } + + /* + * Check if some critical operation is in progress as indicated + * by FSP_DISR_CRIT_OP_IN_PROGRESS. Timeout is 128 seconds + */ + if (fsp_crit_op_in_progress(fsp)) { + printf("FSP: Critical operation in progress\n"); + fsp->state = fsp_mbx_crit_op; + fsp_set_hir_timeout(FSP_CRITICAL_OP_TIMEOUT); + } else + fsp_prep_for_reset(fsp); +} + +void fsp_trigger_reset(void) +{ + lock(&fsp_lock); + __fsp_trigger_reset(); + unlock(&fsp_lock); +} + +static void fsp_start_rr(struct fsp *fsp) +{ + struct fsp_iopath *iop; + + if (fsp->state == fsp_mbx_rr) + return; + + /* We no longer have an active path on that FSP */ + if (fsp->active_iopath >= 0) { + iop = &fsp->iopath[fsp->active_iopath]; + iop->state = fsp_path_bad; + fsp->active_iopath = -1; + } + fsp->state = fsp_mbx_rr; + disr_last_print = 0; + hstate_last_print = 0; + + /* + * Mark all command classes as non-busy and clear their + * timeout, then flush all messages in our staging queue + */ + fsp_reset_cmdclass(); + + /* Notify clients. We have to drop the lock here */ + unlock(&fsp_lock); + fsp_notify_rr_state(FSP_RESET_START); + lock(&fsp_lock); + + /* Start polling PSI */ + psi_set_link_polling(true); +} + +static void fsp_trace_event(struct fsp *fsp, u32 evt, + u32 data0, u32 data1, u32 data2, u32 data3) +{ + union trace tfsp __unused; +#ifdef FSP_TRACE_EVENT + size_t len = sizeof(struct trace_fsp_event); + + tfsp.fsp_evt.event = evt; + tfsp.fsp_evt.fsp_state = fsp->state; + tfsp.fsp_evt.data[0] = data0; + tfsp.fsp_evt.data[1] = data1; + tfsp.fsp_evt.data[2] = data2; + tfsp.fsp_evt.data[3] = data3; + trace_add(&tfsp, TRACE_FSP_EVENT, len); +#endif /* FSP_TRACE_EVENT */ +} + +static void fsp_handle_errors(struct fsp *fsp) +{ + u32 hstate; + struct fsp_iopath *iop; + struct psi *psi; + u32 disr; + + if (fsp->active_iopath < 0) { + prerror("FSP #%d: fsp_handle_errors() with no active IOP\n", + fsp->index); + return; + } + + iop = &fsp->iopath[fsp->active_iopath]; + if (!iop->psi) { + prerror("FSP: Active IOP with no PSI link !\n"); + return; + } + psi = iop->psi; + + /* + * If the link is not up, start R&R immediately, we do call + * psi_disable_link() in this case as while the link might + * not be up, it might still be enabled and the PSI layer + * "active" bit still set + */ + if (!psi_check_link_active(psi)) { + /* Start R&R process */ + fsp_trace_event(fsp, TRACE_FSP_EVT_LINK_DOWN, 0, 0, 0, 0); + prerror("FSP #%d: Link down, starting R&R\n", fsp->index); + + /* If we got here due to a host initiated reset, the link + * is already driven down. 
+ */ + if (fsp->state == fsp_mbx_err) + psi_disable_link(psi); + fsp_start_rr(fsp); + return; + } + + /* Link is up, check for other conditions */ + disr = fsp_rreg(fsp, FSP_DISR_REG); + + /* If in R&R, log values */ + if (disr != disr_last_print) { + fsp_trace_event(fsp, TRACE_FSP_EVT_DISR_CHG, disr, 0, 0, 0); + + printf("FSP #%d: DISR stat change = 0x%08x\n", + fsp->index, disr); + disr_last_print = disr; + } + + /* + * If we detect FSP_IN_RR in DISR or we have a deferred mbox + * error, we trigger an R&R after a bit of housekeeping to + * limit the chance of a stray interrupt + */ + if ((disr & FSP_DISR_FSP_IN_RR) || (fsp->state == fsp_mbx_err)) { + /* + * When Linux comes back up, we still see that bit + * set for a bit, so just move on, nothing to see here + */ + if (fsp->state == fsp_mbx_rr) + return; + + fsp_trace_event(fsp, TRACE_FSP_EVT_SOFT_RR, disr, 0, 0, 0); + + printf("FSP #%d: FSP in reset or delayed error, starting R&R\n", + fsp->index); + + /* Clear all interrupt conditions */ + fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL); + + /* Make sure this happened */ + fsp_rreg(fsp, FSP_HDIR_REG); + + /* Bring the PSI link down */ + psi_disable_link(psi); + + /* Start R&R process */ + fsp_start_rr(fsp); + return; + } + + /* + * If we detect an R&R complete indication, acknowledge it + */ + if (disr & FSP_DISR_FSP_RR_COMPLETE) { + /* + * Acking this bit doesn't make it go away immediately, so + * only do it while still in R&R state + */ + if (fsp->state == fsp_mbx_rr) { + fsp_trace_event(fsp, TRACE_FSP_EVT_RR_COMPL, 0,0,0,0); + + printf("FSP #%d: Detected R&R complete, acking\n", + fsp->index); + + /* Clear HDATA area */ + fsp_wreg(fsp, FSP_MBX1_HDATA_AREA, 0xff); + + /* Ack it (XDN) and clear HPEND & counts */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, + FSP_MBX_CTL_PTS | + FSP_MBX_CTL_XDN | + FSP_MBX_CTL_HPEND | + FSP_MBX_CTL_HCSP_MASK | + FSP_MBX_CTL_DCSP_MASK); + + /* + * Mark the mbox as usable again so we can process + * incoming messages + */ + fsp->state = fsp_mbx_idle; + } + } + + /* + * XXX + * + * Here we detect a number of errors, should we initiate + * an R&R ? + */ + + hstate = fsp_rreg(fsp, FSP_HDES_REG); + if (hstate != hstate_last_print) { + fsp_trace_event(fsp, TRACE_FSP_EVT_HDES_CHG, hstate, 0, 0, 0); + + printf("FSP #%d: HDES stat change = 0x%08x\n", + fsp->index, hstate); + hstate_last_print = hstate; + } + + if (hstate == 0xffffffff) + return; + + /* Clear errors */ + fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1); + + /* + * Most of those errors shouldn't have happened, we just clear + * the error state and return. In the long run, we might want + * to start retrying commands, switching FSPs or links, etc... + * + * We currently don't set our mailbox to a permanent error state. + */ + if (hstate & FSP_DBERRSTAT_ILLEGAL1) + prerror("FSP #%d: Illegal command error !\n", fsp->index); + + if (hstate & FSP_DBERRSTAT_WFULL1) + prerror("FSP #%d: Write to a full mbox !\n", fsp->index); + + if (hstate & FSP_DBERRSTAT_REMPTY1) + prerror("FSP #%d: Read from an empty mbox !\n", fsp->index); + + if (hstate & FSP_DBERRSTAT_PAR1) + prerror("FSP #%d: Parity error !\n", fsp->index); +} + +/* + * This is called by fsp_post_msg() to check if the mbox + * is in a state that allows sending of a message + * + * Due to the various "interesting" contexts fsp_post_msg() + * can be called from, including recursive locks from lock + * error messages or console code, this should avoid doing + * anything more complex than checking a bit of state.
+ * + * Specifically, we cannot initiate an R&R and call back into + * clients etc... from this function. + * + * The best we can do is to se the mbox in error state and + * handle it later during a poll or interrupts. + */ +static bool fsp_check_can_send(struct fsp *fsp) +{ + struct fsp_iopath *iop; + struct psi *psi; + + /* Look for FSP in non-idle state */ + if (fsp->state != fsp_mbx_idle) + return false; + + /* Look for an active IO path */ + if (fsp->active_iopath < 0) + goto mbox_error; + iop = &fsp->iopath[fsp->active_iopath]; + if (!iop->psi) { + prerror("FSP: Active IOP with no PSI link !\n"); + goto mbox_error; + } + psi = iop->psi; + + /* Check if link has gone down. This will be handled later */ + if (!psi_check_link_active(psi)) { + prerror("FSP #%d: Link seems to be down on send\n", fsp->index); + goto mbox_error; + } + + /* XXX Do we want to check for other error conditions ? */ + return true; + + /* + * An error of some case occurred, we'll handle it later + * from a more normal "poll" context + */ + mbox_error: + fsp->state = fsp_mbx_err; + return false; +} + +static bool fsp_post_msg(struct fsp *fsp, struct fsp_msg *msg) +{ + u32 ctl, reg; + int i, wlen; + + DBG("FSP #%d: fsp_post_msg (w0: 0x%08x w1: 0x%08x)\n", + fsp->index, msg->word0, msg->word1); + + /* Note: We used to read HCTL here and only modify some of + * the bits in it. This was bogus, because we would write back + * the incoming bits as '1' and clear them, causing fsp_poll() + * to then miss them. Let's just start with 0, which is how + * I suppose the HW intends us to do. + */ + + /* Set ourselves as busy */ + fsp->pending = msg; + fsp->state = fsp_mbx_send; + msg->state = fsp_msg_sent; + + /* We trace after setting the mailbox state so that if the + * tracing recurses, it ends up just queuing the message up + */ + fsp_trace_msg(msg, TRACE_FSP_MSG_OUT); + + /* Build the message in the mailbox */ + reg = FSP_MBX1_HDATA_AREA; + fsp_wreg(fsp, reg, msg->word0); reg += 4; + fsp_wreg(fsp, reg, msg->word1); reg += 4; + wlen = (msg->dlen + 3) >> 2; + for (i = 0; i < wlen; i++) { + fsp_wreg(fsp, reg, msg->data.words[i]); + reg += 4; + } + + /* Write the header */ + fsp_wreg(fsp, FSP_MBX1_HHDR0_REG, (msg->dlen + 8) << 16); + + /* Write the control register */ + ctl = 4 << FSP_MBX_CTL_HCHOST_SHIFT; + ctl |= (msg->dlen + 8) << FSP_MBX_CTL_DCHOST_SHIFT; + ctl |= FSP_MBX_CTL_PTS | FSP_MBX_CTL_SPPEND; + DBG(" new ctl: %08x\n", ctl); + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, ctl); + + return true; +} + +static void fsp_poke_queue(struct fsp_cmdclass *cmdclass) +{ + struct fsp *fsp = fsp_get_active(); + struct fsp_msg *msg; + + if (!fsp) + return; + if (!fsp_check_can_send(fsp)) + return; + + /* From here to the point where fsp_post_msg() sets fsp->state + * to !idle we must not cause any re-entrancy (no debug or trace) + * in a code path that may hit fsp_post_msg() (it's ok to do so + * if we are going to bail out), as we are committed to calling + * fsp_post_msg() and so a re-entrancy could cause us to do a + * double-send into the mailbox. 
+ */ + if (cmdclass->busy || list_empty(&cmdclass->msgq)) + return; + + msg = list_top(&cmdclass->msgq, struct fsp_msg, link); + assert(msg); + cmdclass->busy = true; + + if (!fsp_post_msg(fsp, msg)) { + prerror("FSP #%d: Failed to send message\n", fsp->index); + cmdclass->busy = false; + return; + } +} + +static void __fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod, + u8 add_words, va_list list) +{ + bool response = !!(cmd_sub_mod & 0x1000000); + u8 cmd = (cmd_sub_mod >> 16) & 0xff; + u8 sub = (cmd_sub_mod >> 8) & 0xff; + u8 mod = cmd_sub_mod & 0xff; + int i; + + msg->word0 = cmd & 0xff; + msg->word1 = mod << 8 | sub; + msg->response = response; + msg->dlen = add_words << 2; + + for (i = 0; i < add_words; i++) + msg->data.words[i] = va_arg(list, unsigned int); + va_end(list); + + /* Initialize the value with false. If this ends up + * in fsp_sync_msg, we will set it to true. + */ + msg->sync_msg = false; +} + +extern void fsp_fillmsg(struct fsp_msg *msg, u32 cmd_sub_mod, u8 add_words, ...) +{ + va_list list; + + va_start(list, add_words); + __fsp_fillmsg(msg, cmd_sub_mod, add_words, list); + va_end(list); +} + +struct fsp_msg *fsp_mkmsg(u32 cmd_sub_mod, u8 add_words, ...) +{ + struct fsp_msg *msg = fsp_allocmsg(!!(cmd_sub_mod & 0x1000000)); + va_list list; + + if (!msg) { + prerror("FSP: Failed to allocate struct fsp_msg\n"); + return NULL; + } + + va_start(list, add_words); + __fsp_fillmsg(msg, cmd_sub_mod, add_words, list); + va_end(list); + + return msg; +} + +/* + * IMPORTANT NOTE: This is *guaranteed* to not call the completion + * routine recusrively for *any* fsp message, either the + * queued one or a previous one. Thus it is *ok* to call + * this function with a lock held which will itself be + * taken by the completion function. + * + * Any change to this implementation must respect this + * rule. This will be especially true of things like + * reset/reload and error handling, if we fail to queue + * we must just return an error, not call any completion + * from the scope of fsp_queue_msg(). + */ +int fsp_queue_msg(struct fsp_msg *msg, void (*comp)(struct fsp_msg *msg)) +{ + struct fsp_cmdclass *cmdclass; + struct fsp *fsp = fsp_get_active(); + bool need_unlock; + u16 seq; + int rc = 0; + + if (!fsp) + return -1; + + /* Recursive locking */ + need_unlock = lock_recursive(&fsp_lock); + + /* Grab a new sequence number */ + seq = fsp_curseq; + fsp_curseq = fsp_curseq + 1; + if (fsp_curseq == 0) + fsp_curseq = 0x8000; + msg->word0 = (msg->word0 & 0xffff) | seq << 16; + + /* Set completion */ + msg->complete = comp; + + /* Clear response state */ + if (msg->resp) + msg->resp->state = fsp_msg_unused; + + /* Queue the message in the appropriate queue */ + cmdclass = fsp_get_cmdclass(msg); + if (!cmdclass) { + prerror("FSP: Invalid msg in fsp_queue_msg w0/1=0x%08x/%08x\n", + msg->word0, msg->word1); + rc = -1; + goto unlock; + } + + msg->state = fsp_msg_queued; + + /* + * If we have initiated or about to initiate a reset/reload operation, + * we stash the message on the R&R backup queue. Otherwise, queue it + * normally and poke the HW + */ + if (fsp_in_hir(fsp) || fsp_in_reset(fsp)) + list_add_tail(&cmdclass->rr_queue, &msg->link); + else { + list_add_tail(&cmdclass->msgq, &msg->link); + fsp_poke_queue(cmdclass); + } + + unlock: + if (need_unlock) + unlock(&fsp_lock); + + return rc; +} + +/* WARNING: This will drop the FSP lock !!! 
*/ +static void fsp_complete_msg(struct fsp_msg *msg) +{ + struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg); + void (*comp)(struct fsp_msg *msg); + + assert(cmdclass); + + DBG(" completing msg, word0: 0x%08x\n", msg->word0); + + comp = msg->complete; + list_del_from(&cmdclass->msgq, &msg->link); + cmdclass->busy = false; + msg->state = fsp_msg_done; + + unlock(&fsp_lock); + if (comp) + (*comp)(msg); + lock(&fsp_lock); +} + +/* WARNING: This will drop the FSP lock !!! */ +static void fsp_complete_send(struct fsp *fsp) +{ + struct fsp_msg *msg = fsp->pending; + struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg); + + assert(msg); + assert(cmdclass); + + fsp->pending = NULL; + + DBG(" completing send, word0: 0x%08x, resp: %d\n", + msg->word0, msg->response); + + if (msg->response) { + u64 setbit = fsp_get_class_bit(msg->word0 & 0xff); + msg->state = fsp_msg_wresp; + fsp_cmdclass_resp_bitmask |= setbit; + cmdclass->timesent = mftb(); + } else + fsp_complete_msg(msg); +} + +static void fsp_alloc_inbound(struct fsp_msg *msg) +{ + u16 func_id = msg->data.words[0] & 0xffff; + u32 len = msg->data.words[1]; + u32 tce_token = 0, act_len = 0; + u8 rc = 0; + void *buf; + + printf("FSP: Allocate inbound buffer func: %04x len: %d\n", + func_id, len); + + lock(&fsp_lock); + if ((fsp_inbound_off + len) > FSP_INBOUND_SIZE) { + prerror("FSP: Out of space in buffer area !\n"); + rc = 0xeb; + goto reply; + } + + if (!fsp_inbound_buf) { + fsp_inbound_buf = memalign(TCE_PSIZE, FSP_INBOUND_SIZE); + if (!fsp_inbound_buf) { + prerror("FSP: could not allocate fsp_inbound_buf!\n"); + rc = 0xeb; + goto reply; + } + } + + buf = fsp_inbound_buf + fsp_inbound_off; + tce_token = PSI_DMA_INBOUND_BUF + fsp_inbound_off; + len = (len + 0xfff) & ~0xfff; + fsp_inbound_off += len; + fsp_tce_map(tce_token, buf, len); + printf("FSP: -> buffer at 0x%p, TCE: 0x%08x, alen: 0x%x\n", + buf, tce_token, len); + act_len = len; + + reply: + unlock(&fsp_lock); + fsp_queue_msg(fsp_mkmsg(FSP_RSP_ALLOC_INBOUND | rc, + 3, 0, tce_token, act_len), fsp_freemsg); +} + +void *fsp_inbound_buf_from_tce(u32 tce_token) +{ + u32 offset = tce_token - PSI_DMA_INBOUND_BUF; + + if (tce_token < PSI_DMA_INBOUND_BUF || offset >= fsp_inbound_off) { + prerror("FSP: TCE token 0x%x out of bounds\n", tce_token); + return NULL; + } + return fsp_inbound_buf + offset; +} + +static void fsp_repost_queued_msgs_post_rr(void) +{ + struct fsp_msg *msg; + int i; + + for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i]; + bool poke = false; + + while(!list_empty(&cmdclass->rr_queue)) { + msg = list_pop(&cmdclass->rr_queue, + struct fsp_msg, link); + list_add_tail(&cmdclass->msgq, &msg->link); + poke = true; + } + if (poke) + fsp_poke_queue(cmdclass); + } +} + +static bool fsp_local_command(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u32 cmd = 0; + u32 rsp_data = 0; + + switch(cmd_sub_mod) { + case FSP_CMD_CONTINUE_IPL: + /* We get a CONTINUE_IPL as a response to OPL */ + printf("FSP: Got CONTINUE_IPL !\n"); + ipl_state |= ipl_got_continue; + return true; + + case FSP_CMD_HV_STATE_CHG: + printf("FSP: Got HV state change request to %d\n", + msg->data.bytes[0]); + + /* Send response synchronously for now, we might want to + * deal with that sort of stuff asynchronously if/when + * we add support for auto-freeing of messages + */ + fsp_sync_msg(fsp_mkmsg(FSP_RSP_HV_STATE_CHG, 0), true); + return true; + + case FSP_CMD_SP_NEW_ROLE: + /* FSP is assuming a new role */ + printf("FSP: FSP assuming new role\n"); + 
fsp_sync_msg(fsp_mkmsg(FSP_RSP_SP_NEW_ROLE, 0), true); + ipl_state |= ipl_got_new_role; + return true; + + case FSP_CMD_SP_QUERY_CAPS: + printf("FSP: FSP query capabilities\n"); + /* XXX Do something saner. For now do a synchronous + * response and hard code our capabilities + */ + fsp_sync_msg(fsp_mkmsg(FSP_RSP_SP_QUERY_CAPS, 4, + 0x3ff80000, 0, 0, 0), true); + ipl_state |= ipl_got_caps; + return true; + case FSP_CMD_FSP_FUNCTNAL: + printf("FSP: Got FSP Functional\n"); + ipl_state |= ipl_got_fsp_functional; + return true; + case FSP_CMD_ALLOC_INBOUND: + fsp_alloc_inbound(msg); + return true; + case FSP_CMD_SP_RELOAD_COMP: + printf("FSP: SP says Reset/Reload complete\n"); + if (msg->data.bytes[3] & PPC_BIT8(0)) { + fsp_fips_dump_notify(msg->data.words[1], + msg->data.words[2]); + + if (msg->data.bytes[3] & PPC_BIT8(1)) + printf(" PLID is %x\n", + msg->data.words[3]); + } + if (msg->data.bytes[3] & PPC_BIT8(2)) + printf(" A Reset/Reload was NOT done\n"); + else { + /* Notify clients that the FSP is back up */ + fsp_notify_rr_state(FSP_RELOAD_COMPLETE); + fsp_repost_queued_msgs_post_rr(); + } + return true; + case FSP_CMD_PANELSTATUS: + case FSP_CMD_PANELSTATUS_EX1: + case FSP_CMD_PANELSTATUS_EX2: + /* Panel status messages. We currently just ignore them */ + return true; + case FSP_CMD_CLOSE_HMC_INTF: + /* Close the HMC interface */ + /* Though Sapphire does not support a HMC connection, the FSP + * sends this message when it is trying to open any new + * hypervisor session. So returning an error 0x51. + */ + cmd = FSP_RSP_CLOSE_HMC_INTF | FSP_STAUS_INVALID_HMC_ID; + rsp_data = msg->data.bytes[0] << 24 | msg->data.bytes[1] << 16; + rsp_data &= 0xffff0000; + fsp_queue_msg(fsp_mkmsg(cmd, 1, rsp_data), fsp_freemsg); + return true; + } + return false; +} + + +/* This is called without the FSP lock */ +static void fsp_handle_command(struct fsp_msg *msg) +{ + struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg); + struct fsp_client *client, *next; + u32 cmd_sub_mod; + + if (!cmdclass) { + prerror("FSP: Got message for unknown class %x\n", + msg->word0 & 0xff); + goto free; + } + + cmd_sub_mod = (msg->word0 & 0xff) << 16; + cmd_sub_mod |= (msg->word1 & 0xff) << 8; + cmd_sub_mod |= (msg->word1 >> 8) & 0xff; + + /* Some commands are handled locally */ + if (fsp_local_command(cmd_sub_mod, msg)) + goto free; + + /* The rest go to clients */ + list_for_each_safe(&cmdclass->clientq, client, next, link) { + if (client->message(cmd_sub_mod, msg)) + goto free; + } + + prerror("FSP: Unhandled message %06x\n", cmd_sub_mod); + + /* We don't know whether the message expected some kind of + * response, so we send one anyway + */ + fsp_queue_msg(fsp_mkmsg((cmd_sub_mod & 0xffff00) | 0x008020, 0), + fsp_freemsg); + free: + fsp_freemsg(msg); +} + +static void __fsp_fill_incoming(struct fsp *fsp, struct fsp_msg *msg, + int dlen, u32 w0, u32 w1) +{ + unsigned int wlen, i, reg; + + msg->dlen = dlen - 8; + msg->word0 = w0; + msg->word1 = w1; + wlen = (dlen + 3) >> 2; + reg = FSP_MBX1_FDATA_AREA + 8; + for (i = 0; i < wlen; i++) { + msg->data.words[i] = fsp_rreg(fsp, reg); + reg += 4; + } + + /* Ack it (XDN) and clear HPEND & counts */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, + FSP_MBX_CTL_PTS | + FSP_MBX_CTL_XDN | + FSP_MBX_CTL_HPEND | + FSP_MBX_CTL_HCSP_MASK | + FSP_MBX_CTL_DCSP_MASK); + + fsp_trace_msg(msg, TRACE_FSP_MSG_IN); +} + +static void __fsp_drop_incoming(struct fsp *fsp) +{ + /* Ack it (XDN) and clear HPEND & counts */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, + FSP_MBX_CTL_PTS | + FSP_MBX_CTL_XDN | + FSP_MBX_CTL_HPEND 
| + FSP_MBX_CTL_HCSP_MASK | + FSP_MBX_CTL_DCSP_MASK); +} + +/* WARNING: This will drop the FSP lock */ +static void fsp_handle_incoming(struct fsp *fsp) +{ + struct fsp_msg *msg; + u32 h0, w0, w1; + unsigned int dlen; + bool special_response = false; + + h0 = fsp_rreg(fsp, FSP_MBX1_FHDR0_REG); + dlen = (h0 >> 16) & 0xff; + + w0 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA); + w1 = fsp_rreg(fsp, FSP_MBX1_FDATA_AREA + 4); + + DBG(" Incoming: w0: 0x%08x, w1: 0x%08x, dlen: %d\n", + w0, w1, dlen); + + /* Some responses are expected out of band */ + if ((w0 & 0xff) == FSP_MCLASS_HMC_INTFMSG && + ((w1 & 0xff) == 0x8a || ((w1 & 0xff) == 0x8b))) + special_response = true; + + /* Check for response bit */ + if (w1 & 0x80 && !special_response) { + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(w0 & 0xff); + struct fsp_msg *req; + + if (!cmdclass) { + prerror("FSP: Got response for unknown class %x\n", + w0 & 0xff); + __fsp_drop_incoming(fsp); + return; + } + + if (!cmdclass->busy || list_empty(&cmdclass->msgq)) { + prerror("FSP #%d: Got orphan response !\n", fsp->index); + __fsp_drop_incoming(fsp); + return; + } + req = list_top(&cmdclass->msgq, struct fsp_msg, link); + + /* Check if the response seems to match the message */ + if (req->state != fsp_msg_wresp || + (req->word0 & 0xff) != (w0 & 0xff) || + (req->word1 & 0xff) != (w1 & 0x7f)) { + __fsp_drop_incoming(fsp); + prerror("FSP #%d: Response doesn't match pending msg\n", + fsp->index); + return; + } else { + u64 resetbit = ~fsp_get_class_bit(req->word0 & 0xff); + fsp_cmdclass_resp_bitmask &= resetbit; + cmdclass->timesent = 0; + } + + /* Allocate response if needed XXX We need to complete + * the original message with some kind of error here ? + */ + if (!req->resp) { + req->resp = __fsp_allocmsg(); + if (!req->resp) { + __fsp_drop_incoming(fsp); + prerror("FSP #%d: Failed to allocate response\n", + fsp->index); + return; + } + } + + /* Populate and complete (will drop the lock) */ + req->resp->state = fsp_msg_response; + __fsp_fill_incoming(fsp, req->resp, dlen, w0, w1); + fsp_complete_msg(req); + return; + } + + /* Allocate an incoming message */ + msg = __fsp_allocmsg(); + if (!msg) { + __fsp_drop_incoming(fsp); + prerror("FSP #%d: Failed to allocate incoming msg\n", + fsp->index); + return; + } + msg->state = fsp_msg_incoming; + __fsp_fill_incoming(fsp, msg, dlen, w0, w1); + + /* Handle FSP commands. This can recurse into fsp_queue_msg etc.. */ + unlock(&fsp_lock); + fsp_handle_command(msg); + lock(&fsp_lock); +} + +static void fsp_check_queues(struct fsp *fsp) +{ + int i; + + /* XXX In the long run, we might want to have a queue of + * classes waiting to be serviced to speed this up, either + * that or a bitmap. 
+ */ + for (i = 0; i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + struct fsp_cmdclass *cmdclass = &fsp_cmdclass[i]; + + if (fsp->state != fsp_mbx_idle) + break; + if (cmdclass->busy || list_empty(&cmdclass->msgq)) + continue; + fsp_poke_queue(cmdclass); + } +} + +static void __fsp_poll(bool interrupt) +{ + struct fsp_iopath *iop; + struct fsp *fsp = fsp_get_active(); + u32 ctl, hdir = 0; + bool psi_irq; + + /* + * The tracer isn't terribly efficient at detecting dups + * especially when coming from multiple CPUs so we do our + * own change-detection locally + */ + static u32 hdir_last_trace; + static u32 ctl_last_trace; + static bool psi_irq_last_trace; + static bool irq_last_trace; + + if (!fsp) + return; + + /* Crazy interrupt handling scheme: + * + * In order to avoid "losing" interrupts when polling the mbox + * we only clear interrupt conditions when called as a result of + * an interrupt. + * + * That way, if a poll clears, for example, the HPEND condition, + * the interrupt remains, causing a dummy interrupt later on + * thus allowing the OS to be notified of a state change (ie it + * doesn't need every poll site to monitor every state change). + * + * However, this scheme is complicated by the fact that we need + * to clear the interrupt condition after we have cleared the + * original condition in HCTL, and we might have long stale + * interrupts which we do need to eventually get rid of. However + * clearing interrupts in such a way is racy, so we need to loop + * and re-poll HCTL after having done so or we might miss an + * event. It's a latency risk, but unlikely and probably worth it. + */ + + again: + if (fsp->active_iopath < 0) { + /* That should never happen */ + if (interrupt) + prerror("FSP: Interrupt with no working IO path\n"); + return; + } + iop = &fsp->iopath[fsp->active_iopath]; + + /* Handle host initiated resets */ + if (fsp_in_hir(fsp)) { + fsp_hir_poll(fsp, iop->psi); + return; + } + + /* Check for error state and handle R&R completion */ + fsp_handle_errors(fsp); + + /* + * The above might have triggered and R&R, check that we + * are still functional + */ + if ((fsp->active_iopath < 0) || fsp_in_hir(fsp)) + return; + iop = &fsp->iopath[fsp->active_iopath]; + + /* Read interrupt status (we may or may not use it) */ + hdir = fsp_rreg(fsp, FSP_HDIR_REG); + + /* Read control now as well so we can trace them */ + ctl = fsp_rreg(fsp, FSP_MBX1_HCTL_REG); + + /* Ditto with PSI irq state */ + psi_irq = psi_poll_fsp_interrupt(iop->psi); + + /* Trace it if anything changes */ + if (hdir != hdir_last_trace || ctl != ctl_last_trace || + interrupt != irq_last_trace || psi_irq != psi_irq_last_trace) { + fsp_trace_event(fsp, TRACE_FSP_EVT_POLL_IRQ, + interrupt, hdir, ctl, psi_irq); + + hdir_last_trace = hdir; + ctl_last_trace = ctl; + irq_last_trace = interrupt; + psi_irq_last_trace = psi_irq; + } + + /* + * We *MUST* ignore the MBOX2 bits here. While MBOX2 cannot generate + * interrupt, it might still latch some bits here (and we found cases + * where the MBOX2 XUP would be set). If that happens, clearing HDIR + * never works (the bit gets set again immediately) because we don't + * clear the condition in HTCL2 and thus we loop forever. + */ + hdir &= FSP_DBIRQ_MBOX1; + + /* + * Sanity check: If an interrupt is pending and we are in polling + * mode, check that the PSI side is also pending. If some bit is + * set, just clear and move on. + */ + if (hdir && !interrupt && !psi_irq) { + prerror("FSP: WARNING ! 
HDIR 0x%08x but no PSI irq !\n", hdir); + fsp_wreg(fsp, FSP_HDIR_REG, hdir); + } + + /* + * We should never have the mbox in error state here unless it + * was fine until some printf inside fsp_handle_errors() caused + * the console to poke the FSP which detected a branch new error + * in the process. Let's be safe rather than sorry and handle that + * here + */ + if (fsp_in_hir(fsp) || fsp->state == fsp_mbx_err) { + prerror("FSP: Late error state detection\n"); + goto again; + } + + /* + * If we are in an R&R state with an active IO path, we + * shouldn't be getting interrupts. If we do, just clear + * the condition and print a message + */ + if (fsp->state == fsp_mbx_rr) { + if (interrupt) { + prerror("FSP: Interrupt in RR state [HDIR=0x%08x]\n", + hdir); + fsp_wreg(fsp, FSP_HDIR_REG, hdir); + } + return; + } + + /* Poll FSP CTL */ + if (ctl & (FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND)) + DBG("FSP #%d: poll, ctl: %x\n", fsp->index, ctl); + + /* Do we have a pending message waiting to complete ? */ + if (ctl & FSP_MBX_CTL_XUP) { + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP); + if (fsp->state == fsp_mbx_send) { + /* mbox is free */ + fsp->state = fsp_mbx_idle; + + /* Complete message (will break the lock) */ + fsp_complete_send(fsp); + + /* Lock can have been broken, so ctl is now + * potentially invalid, let's recheck + */ + goto again; + } else { + prerror("FSP #%d: Got XUP with no pending message !\n", + fsp->index); + } + } + + if (fsp->state == fsp_mbx_send) { + /* XXX Handle send timeouts!!! */ + } + + /* Is there an incoming message ? This will break the lock as well */ + if (ctl & FSP_MBX_CTL_HPEND) + fsp_handle_incoming(fsp); + + /* Note: Lock may have been broken above, thus ctl might be invalid + * now, don't use it any further. + */ + + /* Check for something else to send */ + if (fsp->state == fsp_mbx_idle) + fsp_check_queues(fsp); + + /* Clear interrupts, and recheck HCTL if any occurred */ + if (interrupt && hdir) { + fsp_wreg(fsp, FSP_HDIR_REG, hdir); + goto again; + } +} + +void fsp_poll(void) +{ + lock(&fsp_lock); + __fsp_poll(false); + unlock(&fsp_lock); +} + +void fsp_interrupt(void) +{ + lock(&fsp_lock); + __fsp_poll(true); + unlock(&fsp_lock); +} + +int fsp_sync_msg(struct fsp_msg *msg, bool autofree) +{ + int rc; + + /* This indication is useful only in the case where + * we queue up messages when the FSP takes a r/r. + */ + msg->sync_msg = true; + msg->auto_free = autofree; + + rc = fsp_queue_msg(msg, NULL); + if (rc) + goto bail; + + while(fsp_msg_busy(msg)) + fsp_poll(); + + switch(msg->state) { + case fsp_msg_done: + rc = 0; + break; + case fsp_msg_timeout: + rc = -1; /* XXX to improve */ + break; + default: + rc = -1; /* Should not happen... (assert ?) 
*/ + } + + if (msg->resp) + rc = (msg->resp->word1 >> 8) & 0xff; + bail: + if (autofree) + fsp_freemsg(msg); + return rc; +} + +void fsp_register_client(struct fsp_client *client, u8 msgclass) +{ + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass); + + if (!fsp_present()) + return; + assert(cmdclass); + list_add_tail(&cmdclass->clientq, &client->link); +} + +void fsp_unregister_client(struct fsp_client *client, u8 msgclass) +{ + struct fsp_cmdclass *cmdclass = __fsp_get_cmdclass(msgclass); + + if (!fsp_present()) + return; + assert(cmdclass); + list_del_from(&cmdclass->clientq, &client->link); +} + +static int fsp_init_mbox(struct fsp *fsp) +{ + unsigned int i; + u32 reg; + + /* + * Note: The documentation contradicts itself as to + * whether the HDIM bits should be set or cleared to + * enable interrupts + * + * This seems to work... + */ + + /* Mask all interrupts */ + fsp_wreg(fsp, FSP_HDIM_CLR_REG, FSP_DBIRQ_ALL); + + /* Clear all errors */ + fsp_wreg(fsp, FSP_HDES_REG, FSP_DBERRSTAT_CLR1 | FSP_DBERRSTAT_CLR2); + + /* Initialize data area as the doco says */ + for (i = 0; i < 0x40; i += 4) + fsp_wreg(fsp, FSP_MBX1_HDATA_AREA + i, 0); + + /* + * Clear whatever crap may remain in HDCR. Do not write XDN as that + * would be interpreted incorrectly as an R&R completion which + * we aren't ready to send yet ! + */ + fsp_wreg(fsp, FSP_MBX1_HCTL_REG, FSP_MBX_CTL_XUP | FSP_MBX_CTL_HPEND | + FSP_MBX_CTL_HCSP_MASK | FSP_MBX_CTL_DCSP_MASK | + FSP_MBX_CTL_PTS); + + /* Clear all pending interrupts */ + fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL); + + /* Enable all mbox1 interrupts */ + fsp_wreg(fsp, FSP_HDIM_SET_REG, FSP_DBIRQ_MBOX1); + + /* Decode what FSP we are connected to */ + reg = fsp_rreg(fsp, FSP_SCRATCH0_REG); + if (reg & PPC_BIT32(0)) { /* Is it a valid connection */ + if (reg & PPC_BIT32(3)) + printf("FSP: Connected to FSP-B\n"); + else + printf("FSP: Connected to FSP-A\n"); + } + + return 0; +} + +/* We use a single fixed TCE table for all PSI interfaces */ +static void fsp_init_tce_table(void) +{ + fsp_tce_table = (u64 *)PSI_TCE_TABLE_BASE; + + /* Memset the larger table even if we only use the smaller + * one on P7 + */ + memset(fsp_tce_table, 0, PSI_TCE_TABLE_SIZE_P8); +} + +void fsp_tce_map(u32 offset, void *addr, u32 size) +{ + u64 raddr = (u64)addr; + + assert(!(offset & 0xfff)); + assert(!(raddr & 0xfff)); + assert(!(size & 0xfff)); + + size >>= 12; + offset >>= 12; + + while(size--) { + fsp_tce_table[offset++] = raddr | 0x3; + raddr += 0x1000; + } +} + +void fsp_tce_unmap(u32 offset, u32 size) +{ + assert(!(offset & 0xfff)); + assert(!(size & 0xfff)); + + size >>= 12; + offset >>= 12; + + while(size--) + fsp_tce_table[offset++] = 0; +} + +static struct fsp *fsp_find_by_index(int index) +{ + struct fsp *fsp = first_fsp; + + do { + if (fsp->index == index) + return fsp; + } while (fsp->link != first_fsp); + + return NULL; +} + +static void fsp_init_links(struct dt_node *fsp_node) +{ + const struct dt_property *linksprop; + int i, index; + struct fsp *fsp; + struct fsp_iopath *fiop; + + linksprop = dt_find_property(fsp_node, "ibm,psi-links"); + index = dt_prop_get_u32(fsp_node, "reg"); + fsp = fsp_find_by_index(index); + if (!fsp) { + prerror("FSP: FSP with index %d not found\n", index); + return; + } + + fsp->state = fsp_mbx_idle; + + /* Iterate all links */ + for (i = 0; i < fsp->iopath_count; i++) { + u64 reg; + u32 link; + + link = ((const u32 *)linksprop->prop)[i]; + fiop = &fsp->iopath[i]; + fiop->psi = psi_find_link(link); + if (fiop->psi == NULL) { + prerror("FSP 
#%d: Couldn't find PSI link\n", + fsp->index); + continue; + } + + printf("FSP #%d: Found PSI HB link to chip %d\n", + fsp->index, link); + + psi_fsp_link_in_use(fiop->psi); + + /* Get the FSP register window */ + reg = in_be64(fiop->psi->regs + PSIHB_FSPBAR); + fiop->fsp_regs = (void *)(reg | (1ULL << 63) | + dt_prop_get_u32(fsp_node, "reg-offset")); + } +} + +static void fsp_update_links_states(struct fsp *fsp) +{ + struct fsp_iopath *fiop; + unsigned int i; + + /* Iterate all links */ + for (i = 0; i < fsp->iopath_count; i++) { + fiop = &fsp->iopath[i]; + if (!fiop->psi) + continue; + if (!fiop->psi->working) + fiop->state = fsp_path_bad; + else if (fiop->psi->active) { + fsp->active_iopath = i; + fiop->state = fsp_path_active; + } else + fiop->state = fsp_path_backup; + } + + if (fsp->active_iopath >= 0) { + if (!active_fsp || (active_fsp != fsp)) + active_fsp = fsp; + + fsp_inbound_off = 0; + fiop = &fsp->iopath[fsp->active_iopath]; + psi_init_for_fsp(fiop->psi); + fsp_init_mbox(fsp); + psi_enable_fsp_interrupt(fiop->psi); + } +} + +void fsp_reinit_fsp(void) +{ + struct fsp *fsp; + + /* Stop polling PSI */ + psi_set_link_polling(false); + + /* Notify all FSPs to check for an updated link state */ + for (fsp = first_fsp; fsp; fsp = fsp->link) + fsp_update_links_states(fsp); +} + +static void fsp_create_fsp(struct dt_node *fsp_node) +{ + const struct dt_property *linksprop; + struct fsp *fsp; + int count, index; + + index = dt_prop_get_u32(fsp_node, "reg"); + prerror("FSP #%d: Found in device-tree, setting up...\n", index); + + linksprop = dt_find_property(fsp_node, "ibm,psi-links"); + if (!linksprop || linksprop->len < 4) { + prerror("FSP #%d: No links !\n", index); + return; + } + + fsp = zalloc(sizeof(struct fsp)); + if (!fsp) { + prerror("FSP #%d: Can't allocate memory !\n", index); + return; + } + + fsp->index = index; + fsp->active_iopath = -1; + + count = linksprop->len / 4; + printf("FSP #%d: Found %d IO PATH\n", index, count); + if (count > FSP_MAX_IOPATH) { + prerror("FSP #%d: WARNING, limited to %d IO PATH\n", + index, FSP_MAX_IOPATH); + count = FSP_MAX_IOPATH; + } + fsp->iopath_count = count; + + fsp->link = first_fsp; + first_fsp = fsp; + + fsp_init_links(fsp_node); + fsp_update_links_states(fsp); +} + +static void fsp_opal_poll(void *data __unused) +{ + if (try_lock(&fsp_lock)) { + __fsp_poll(false); + unlock(&fsp_lock); + } +} + +static bool fsp_init_one(const char *compat) +{ + struct dt_node *fsp_node; + bool inited = false; + + dt_for_each_compatible(dt_root, fsp_node, compat) { + if (!inited) { + int i; + + /* Initialize the per-class msg queues */ + for (i = 0; + i <= (FSP_MCLASS_LAST - FSP_MCLASS_FIRST); i++) { + list_head_init(&fsp_cmdclass[i].msgq); + list_head_init(&fsp_cmdclass[i].clientq); + list_head_init(&fsp_cmdclass[i].rr_queue); + } + + /* Init the queues for RR notifier cmdclass */ + list_head_init(&fsp_cmdclass_rr.msgq); + list_head_init(&fsp_cmdclass_rr.clientq); + list_head_init(&fsp_cmdclass_rr.rr_queue); + + /* Register poller */ + opal_add_poller(fsp_opal_poll, NULL); + + inited = true; + } + + /* Create the FSP data structure */ + fsp_create_fsp(fsp_node); + } + + return inited; +} + +void fsp_init(void) +{ + printf("FSP: Looking for FSP...\n"); + + fsp_init_tce_table(); + + if (!fsp_init_one("ibm,fsp1") && !fsp_init_one("ibm,fsp2")) { + printf("FSP: No FSP on this machine\n"); + return; + } +} + +bool fsp_present(void) +{ + return first_fsp != NULL; +} + +static void fsp_timeout_poll(void *data __unused) +{ + u64 now = mftb(); + u64 timeout_val = 
0; + u64 cmdclass_resp_bitmask = fsp_cmdclass_resp_bitmask; + struct fsp_cmdclass *cmdclass = NULL; + struct fsp_msg *req = NULL; + u32 index = 0; + + if (timeout_timer == 0) + timeout_timer = now + secs_to_tb(30); + + /* The lowest granularity for a message timeout is 30 secs. + * So every 30 secs, check if there is any message + * waiting for a response from the FSP + */ + if ((tb_compare(now, timeout_timer) == TB_AAFTERB) || + (tb_compare(now, timeout_timer) == TB_AEQUALB)) + timeout_timer = now + secs_to_tb(30); + else + return; + + while (cmdclass_resp_bitmask) { + u64 time_sent = 0; + u64 time_to_comp = 0; + + if (!(cmdclass_resp_bitmask & 0x1)) + goto next_bit; + + cmdclass = &fsp_cmdclass[index]; + timeout_val = secs_to_tb((cmdclass->timeout) * 60); + time_sent = cmdclass->timesent; + time_to_comp = now - cmdclass->timesent; + + /* Now check if the response has timed out */ + if (tb_compare(time_to_comp, timeout_val) == TB_AAFTERB) { + u64 resetbit = 0; + + /* Take the FSP lock now and re-check */ + lock(&fsp_lock); + if (!(fsp_cmdclass_resp_bitmask & (1 << index)) || + time_sent != cmdclass->timesent) { + unlock(&fsp_lock); + goto next_bit; + } + req = list_top(&cmdclass->msgq, struct fsp_msg, link); + log_simple_error(&e_info(OPAL_RC_FSP_POLL_TIMEOUT), + "FSP: Response from FSP timed out, word0 = %x," + "word1 = %x state: %d\n", + req->word0, req->word1, req->state); + fsp_reg_dump(); + resetbit = ~fsp_get_class_bit(req->word0 & 0xff); + fsp_cmdclass_resp_bitmask &= resetbit; + cmdclass->timesent = 0; + if (req->resp) + req->resp->state = fsp_msg_timeout; + fsp_complete_msg(req); + __fsp_trigger_reset(); + unlock(&fsp_lock); + } + next_bit: + cmdclass_resp_bitmask = cmdclass_resp_bitmask >> 1; + index++; + } +} + +void fsp_opl(void) +{ + struct dt_node *iplp; + + if (!fsp_present()) + return; + + /* Send OPL */ + ipl_state |= ipl_opl_sent; + fsp_sync_msg(fsp_mkmsg(FSP_CMD_OPL, 0), true); + while(!(ipl_state & ipl_got_continue)) + fsp_poll(); + + /* Send continue ACK */ + fsp_sync_msg(fsp_mkmsg(FSP_CMD_CONTINUE_ACK, 0), true); + + /* Wait for various FSP messages */ + printf("INIT: Waiting for FSP to advertize new role...\n"); + while(!(ipl_state & ipl_got_new_role)) + fsp_poll(); + printf("INIT: Waiting for FSP to request capabilities...\n"); + while(!(ipl_state & ipl_got_caps)) + fsp_poll(); + + /* Initiate the timeout poller */ + opal_add_poller(fsp_timeout_poll, NULL); + + /* Tell FSP we are in standby */ + printf("INIT: Sending HV Functional: Standby...\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x01000000), true); + + /* Wait for FSP functional */ + printf("INIT: Waiting for FSP functional\n"); + while(!(ipl_state & ipl_got_fsp_functional)) + fsp_poll(); + + /* Tell FSP we are in running state */ + printf("INIT: Sending HV Functional: Runtime...\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x02000000), true); + + /* + * For the factory reset case, FSP sends us the PCI Bus + * Reset request. We don't have to do anything special with + * PCI bus numbers here; just send the Power Down message + * with modifier 0x02 to FSP. + */ + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp && dt_find_property(iplp, "pci-busno-reset-ipl")) { + printf("INIT: PCI Bus Reset requested. Sending Power Down\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_POWERDOWN_PCIRS, 0), true); + } + + /* + * Tell FSP we are in running state with all partitions. + * + * This is needed otherwise the FSP will not reset its reboot count + * on failures.
Ideally we should send that when we know the + * OS is up but we don't currently have a very good way to do + * that so this will do as a stop-gap + */ + printf("INIT: Sending HV Functional: Runtime all parts...\n"); + fsp_sync_msg(fsp_mkmsg(FSP_CMD_HV_FUNCTNAL, 1, 0x04000000), true); +} + +uint32_t fsp_adjust_lid_side(uint32_t lid_no) +{ + struct dt_node *iplp; + const char *side = NULL; + + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) + side = dt_prop_get_def(iplp, "cec-ipl-side", NULL); + if (!side || !strcmp(side, "temp")) + lid_no |= ADJUST_T_SIDE_LID_NO; + return lid_no; +} + +int fsp_fetch_data(uint8_t flags, uint16_t id, uint32_t sub_id, + uint32_t offset, void *buffer, size_t *length) +{ + uint32_t total, remaining = *length; + uint64_t baddr; + uint64_t balign, boff, bsize; + struct fsp_msg *msg; + static struct lock fsp_fetch_lock = LOCK_UNLOCKED; + + *length = total = 0; + + if (!fsp_present()) + return -ENODEV; + + printf("FSP: Fetch data id: %02x sid: %08x to %p (0x%x bytes)\n", + id, sub_id, buffer, remaining); + + /* + * Use a lock to avoid multiple processors trying to fetch + * at the same time and colliding on the TCE space + */ + lock(&fsp_fetch_lock); + + while(remaining) { + uint32_t chunk, taddr, woffset, wlen; + uint8_t rc; + + /* Calculate alignment skew */ + baddr = (uint64_t)buffer; + balign = baddr & ~0xffful; + boff = baddr & 0xffful; + + /* Get a chunk */ + chunk = remaining; + if (chunk > (PSI_DMA_FETCH_SIZE - boff)) + chunk = PSI_DMA_FETCH_SIZE - boff; + bsize = ((boff + chunk) + 0xfff) & ~0xffful; + + printf("FSP: 0x%08x bytes balign=%llx boff=%llx bsize=%llx\n", + chunk, balign, boff, bsize); + fsp_tce_map(PSI_DMA_FETCH, (void *)balign, bsize); + taddr = PSI_DMA_FETCH + boff; + msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 6, + flags << 16 | id, sub_id, offset, + 0, taddr, chunk); + rc = fsp_sync_msg(msg, false); + fsp_tce_unmap(PSI_DMA_FETCH, bsize); + + woffset = msg->resp->data.words[1]; + wlen = msg->resp->data.words[2]; + printf("FSP: -> rc=0x%02x off: %08x twritten: %08x\n", + rc, woffset, wlen); + fsp_freemsg(msg); + + /* XXX Is flash busy (0x3f) a reason for retry ? 
*/ + if (rc != 0 && rc != 2) { + unlock(&fsp_fetch_lock); + return -EIO; + } + + remaining -= wlen; + total += wlen; + buffer += wlen; + offset += wlen; + + /* The doc seems to indicate that we get rc=2 if there's + * more data and rc=0 if we reached the end of file, but + * it looks like I always get rc=0, so let's consider + * an EOF if we got less than what we asked + */ + if (wlen < chunk) + break; + } + unlock(&fsp_fetch_lock); + + *length = total; + + return 0; +} + +/* + * Asynchronous fsp fetch data call + * + * Note: + * buffer = PSI DMA address space + */ +int fsp_fetch_data_queue(uint8_t flags, uint16_t id, uint32_t sub_id, + uint32_t offset, void *buffer, size_t *length, + void (*comp)(struct fsp_msg *msg)) +{ + struct fsp_msg *msg; + uint32_t chunk = *length; + + if (!comp) + return OPAL_PARAMETER; + + msg = fsp_mkmsg(FSP_CMD_FETCH_SP_DATA, 0x6, flags << 16 | id, + sub_id, offset, 0, buffer, chunk); + if (!msg) { + prerror("FSP: allocation failed!\n"); + return OPAL_INTERNAL_ERROR; + } + if (fsp_queue_msg(msg, comp)) { + fsp_freemsg(msg); + prerror("FSP: Failed to queue fetch data message\n"); + return OPAL_INTERNAL_ERROR; + } + return OPAL_SUCCESS; +} + +void fsp_used_by_console(void) +{ + fsp_lock.in_con_path = true; +} diff --git a/hw/gx.c b/hw/gx.c new file mode 100644 index 00000000..31de7b57 --- /dev/null +++ b/hw/gx.c @@ -0,0 +1,158 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +/* + * Note: This file is only used on P7/P7+ + */ + +/* Configuration of the PSI BUID, see the explanation in + * interrupts.h + */ +static int gx_p7_configure_psi_buid(uint32_t chip, uint32_t buid) +{ + uint64_t mode1; + int rc; + + rc = xscom_read(chip, GX_P7_MODE1_REG, &mode1); + if (rc) { + prerror("GX: XSCOM error %d reading GX MODE1 REG\n", rc); + return rc; + } + + mode1 = SETFIELD(GX_P7_MODE1_PSI_BUID, mode1, buid); + mode1 &= ~GX_P7_MODE1_PSI_BUID_DISABLE; + + printf("GX: MODE1_REG set to 0x%llx\n", mode1); + rc = xscom_write(chip, GX_P7_MODE1_REG, mode1); + if (rc) { + prerror("GX: XSCOM error %d writing GX MODE1 REG\n", rc); + return rc; + } + + return 0; +} + +static int gx_p7p_configure_psi_buid(uint32_t chip, uint32_t buid) +{ + uint64_t mode4; + int rc; + + rc = xscom_read(chip, GX_P7P_MODE4_REG, &mode4); + if (rc) { + prerror("GX: XSCOM error %d reading GX MODE4 REG\n", rc); + return rc; + } + + mode4 = SETFIELD(GX_P7P_MODE4_PSI_BUID, mode4, buid); + mode4 &= ~GX_P7P_MODE4_PSI_BUID_DISABLE; + + rc = xscom_write(chip, GX_P7P_MODE4_REG, mode4); + if (rc) { + prerror("GX: XSCOM error %d writing GX MODE4 REG\n", rc); + return rc; + } + + return 0; +} + +/* Configure the BUID of the PSI interrupt in the GX + * controller.
+ * + * @chip: Chip number (0..31) + * @buid: 9-bit BUID value + */ +int gx_configure_psi_buid(uint32_t chip, uint32_t buid) +{ + uint32_t pvr = mfspr(SPR_PVR); + + printf("GX: PSI BUID for PVR %x (type %x) chip %d BUID 0x%x\n", + pvr, PVR_TYPE(pvr), chip, buid); + + switch(PVR_TYPE(pvr)) { + case PVR_TYPE_P7: + return gx_p7_configure_psi_buid(chip, buid); + case PVR_TYPE_P7P: + return gx_p7p_configure_psi_buid(chip, buid); + } + return -1; +} + + +static int gx_p7_configure_tce_bar(uint32_t chip, uint32_t gx, uint64_t addr, + uint64_t size) +{ + uint32_t areg, mreg; + int rc; + + switch (gx) { + case 0: + areg = GX_P7_GX0_TCE_BAR; + mreg = GX_P7_GX0_TCE_MASK; + break; + case 1: + areg = GX_P7_GX1_TCE_BAR; + mreg = GX_P7_GX1_TCE_MASK; + break; + default: + return -EINVAL; + } + + if (addr) { + uint64_t taddr, tmask; + + /* The address field contains bits 18 to 43 of the address */ + taddr = SETFIELD(GX_P7_TCE_BAR_ADDR, 0ul, + (addr >> GX_P7_TCE_BAR_ADDR_SHIFT)); + taddr |= GX_P7_TCE_BAR_ENABLE; + tmask = SETFIELD(GX_P7_TCE_MASK, 0ul, + ~((size - 1) >> GX_P7_TCE_BAR_ADDR_SHIFT)); + rc = xscom_write(chip, areg, 0); + rc |= xscom_write(chip, mreg, tmask); + rc |= xscom_write(chip, areg, taddr); + } else { + rc = xscom_write(chip, areg, 0); + } + return rc ? -EIO : 0; +} + +/* Configure the TCE BAR of a given GX bus + * + * @chip: Chip number (0..31) + * @gx : GX bus index + * @addr: base address of TCE table + * @size: size of TCE table + */ +int gx_configure_tce_bar(uint32_t chip, uint32_t gx, uint64_t addr, + uint64_t size) +{ + uint32_t pvr = mfspr(SPR_PVR); + + printf("GX: TCE BAR for PVR %x (type %x) chip %d gx %d\n", + pvr, PVR_TYPE(pvr), chip, gx); + + /* We only support P7... is there a P7+ with P5IOC2 ? */ + switch(PVR_TYPE(pvr)) { + case PVR_TYPE_P7: + return gx_p7_configure_tce_bar(chip, gx, addr, size); + } + return -EINVAL; +} + + diff --git a/hw/homer.c b/hw/homer.c new file mode 100644 index 00000000..14cfa1e5 --- /dev/null +++ b/hw/homer.c @@ -0,0 +1,143 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define PBA_BAR0 0x2013f00 +#define PBA_BARMASK0 0x2013f04 + +static bool read_pba_bar(struct proc_chip *chip, unsigned int bar_no, + uint64_t *base, uint64_t *size) +{ + uint64_t bar, mask; + int rc; + + rc = xscom_read(chip->id, PBA_BAR0 + bar_no, &bar); + if (rc) { + prerror("SLW: Error %d reading PBA BAR%d on chip %d\n", + rc, bar_no, chip->id); + return false; + } + rc = xscom_read(chip->id, PBA_BARMASK0 + bar_no, &mask); + if (rc) { + prerror("SLW: Error %d reading PBA BAR MASK%d on chip %d\n", + rc, bar_no, chip->id); + return false; + } + printf(" PBA BAR%d : 0x%016llx\n", bar_no, bar); + printf(" PBA MASK%d: 0x%016llx\n", bar_no, mask); + + *base = bar & 0x0ffffffffffffffful; + *size = (mask | 0xfffff) + 1; + + return (*base) != 0; +} + +static void homer_init_chip(struct proc_chip *chip) +{ + uint64_t hbase = 0, hsize = 0; + uint64_t sbase, ssize, obase, osize; + + /* + * PBA BARs assigned by HB: + * + * 0 : Entire HOMER + * 1 : OCC to Centaur path (we don't care) + * 2 : SLW image + * 3 : OCC Common area + * + * We need to reserve the memory covered by BAR 0 and BAR 3, however + * on earlier HBs, BAR0 isn't set so we need BAR 2 instead in that + * case to cover SLW (OCC not running). + */ + if (read_pba_bar(chip, 0, &hbase, &hsize)) { + printf(" HOMER Image at 0x%llx size %lldMB\n", + hbase, hsize / 0x100000); + mem_reserve("ibm,homer-image", hbase, hsize); + + chip->homer_base = hbase; + chip->homer_size = hsize; + } + + /* + * We always read the SLW BAR since we need to grab info about the + * SLW image in the struct proc_chip for use by the slw.c code + */ + if (read_pba_bar(chip, 2, &sbase, &ssize)) { + printf(" SLW Image at 0x%llx size %lldMB\n", + sbase, ssize / 0x100000); + + /* + * Only reserve it if we have no homer image or if it + * doesn't fit in it (only check the base). + */ + if (sbase < hbase || sbase > (hbase + hsize) || + (hbase == 0 && sbase > 0)) + mem_reserve("ibm,slw-image", sbase, ssize); + + chip->slw_base = sbase; + chip->slw_bar_size = ssize; + chip->slw_image_size = ssize; /* will be adjusted later */ + } + + if (read_pba_bar(chip, 3, &obase, &osize)) { + printf(" OCC Common Area at 0x%llx size %lldMB\n", + obase, osize / 0x100000); + chip->occ_common_base = obase; + chip->occ_common_size = osize; + } +} + +void homer_init(void) +{ + struct proc_chip *chip; + + if (proc_gen != proc_gen_p8) + return; + + /* + * XXX This is temporary, on P8 we look for any configured + * SLW/OCC BAR and reserve the memory. Eventually, this will be + * done via HostBoot using the device-tree "reserved-ranges" + * or we'll load the SLW & OCC images ourselves using Host Services. + */ + for_each_chip(chip) { + printf("HOMER: Init chip %d\n", chip->id); + homer_init_chip(chip); + } + + /* + * Check is PBA BARs are already loaded with HOMER and + * skip host services. + */ + + chip = next_chip(NULL); + if (chip->homer_base && chip->occ_common_base) { + /* Reserve OCC comman area from BAR */ + mem_reserve("ibm,occ-common-area", chip->occ_common_base, + chip->occ_common_size); + } else { + /* Allocate memory for HOMER and OCC common area */ + host_services_occ_base_setup(); + } +} + diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c new file mode 100644 index 00000000..f6037e17 --- /dev/null +++ b/hw/lpc-uart.c @@ -0,0 +1,343 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
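Illustrative aside, not part of the original sources: the BAR/MASK decode in read_pba_bar() above is compact, so here is the same arithmetic as a standalone worked example showing the 1MB granularity (the low 20 bits of the mask are forced to ones, and the result plus one is the size). The register values are invented.

#include <stdint.h>
#include <stdio.h>

static void demo_pba_decode(uint64_t bar, uint64_t mask)
{
	uint64_t base = bar & 0x0fffffffffffffffull;	/* drop the top nibble */
	uint64_t size = (mask | 0xfffff) + 1;		/* 1MB granularity */

	printf("base=%#llx size=%lluMB\n",
	       (unsigned long long)base,
	       (unsigned long long)(size >> 20));
}

int main(void)
{
	/* e.g. a 4MB region at 0x7fe800000 */
	demo_pba_decode(0x00000007fe800000ull, 0x00000000003fffffull);
	return 0;
}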
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_LOG_ENTRY(OPAL_RC_UART_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_UART, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +/* UART reg defs */ +#define REG_RBR 0 +#define REG_THR 0 +#define REG_DLL 0 +#define REG_IER 1 +#define REG_DLM 1 +#define REG_FCR 2 +#define REG_IIR 2 +#define REG_LCR 3 +#define REG_MCR 4 +#define REG_LSR 5 +#define REG_MSR 6 +#define REG_SCR 7 + +#define LSR_DR 0x01 /* Data ready */ +#define LSR_OE 0x02 /* Overrun */ +#define LSR_PE 0x04 /* Parity error */ +#define LSR_FE 0x08 /* Framing error */ +#define LSR_BI 0x10 /* Break */ +#define LSR_THRE 0x20 /* Xmit holding register empty */ +#define LSR_TEMT 0x40 /* Xmitter empty */ +#define LSR_ERR 0x80 /* Error */ + +#define LCR_DLAB 0x80 /* DLL access */ + +static uint32_t uart_base; +static bool has_irq, irq_disabled; + +/* + * We implement a simple buffer to buffer input data as some bugs in + * Linux make it fail to read fast enough after we get an interrupt. + * + * We use it on non-interrupt operations as well while at it because + * it doesn't cost us much and might help in a few cases where Linux + * is calling opal_poll_events() but not actually reading. + * + * Most of the time I expect we'll flush it completely to Linux into + * it's tty flip buffers so I don't bother with a ring buffer. + */ +#define IN_BUF_SIZE 0x1000 +static uint8_t *in_buf; +static uint32_t in_count; + +static void uart_trace(u8 ctx, u8 cnt, u8 irq_state, u8 in_count) +{ + union trace t; + + t.uart.ctx = ctx; + t.uart.cnt = cnt; + t.uart.irq_state = irq_state; + t.uart.in_count = in_count; + trace_add(&t, TRACE_UART, sizeof(struct trace_uart)); +} + +static inline uint8_t uart_read(unsigned int reg) +{ + return lpc_inb(uart_base + reg); +} + +static inline void uart_write(unsigned int reg, uint8_t val) +{ + lpc_outb(val, uart_base + reg); +} + +static size_t uart_con_write(const char *buf, size_t len) +{ + size_t written = 0; + + while(written < len) { + while ((uart_read(REG_LSR) & LSR_THRE) == 0) { + int i = 0; + + /* Give the simulator some breathing space */ + for (; i < 1000; ++i) + smt_very_low(); + } + smt_medium(); + uart_write(REG_THR, buf[written++]); + }; + + return written; +} + +/* Must be called with console lock held */ +static void uart_read_to_buffer(void) +{ + /* As long as there is room in the buffer */ + while(in_count < IN_BUF_SIZE) { + /* Read status register */ + uint8_t lsr = uart_read(REG_LSR); + + /* Nothing to read ... 
*/ + if ((lsr & LSR_DR) == 0) + break; + + /* Read and add to buffer */ + in_buf[in_count++] = uart_read(REG_RBR); + } + + if (!has_irq) + return; + + /* If the buffer is full disable the interrupt */ + if (in_count == IN_BUF_SIZE) { + if (!irq_disabled) + uart_write(REG_IER, 0x00); + irq_disabled = true; + } else { + /* Otherwise, enable it */ + if (irq_disabled) + uart_write(REG_IER, 0x01); + irq_disabled = false; + } +} + +/* This is called with the console lock held */ +static size_t uart_con_read(char *buf, size_t len) +{ + size_t read_cnt = 0; + uint8_t lsr = 0; + + if (!in_buf) + return 0; + + /* Read from buffer first */ + if (in_count) { + read_cnt = in_count; + if (len < read_cnt) + read_cnt = len; + memcpy(buf, in_buf, read_cnt); + len -= read_cnt; + if (in_count != read_cnt) + memmove(in_buf, in_buf + read_cnt, in_count - read_cnt); + in_count -= read_cnt; + } + + /* + * If there's still room in the user buffer, read from the UART + * directly + */ + while(len) { + lsr = uart_read(REG_LSR); + if ((lsr & LSR_DR) == 0) + break; + buf[read_cnt++] = uart_read(REG_RBR); + len--; + } + + /* Finally, flush whatever's left in the UART into our buffer */ + uart_read_to_buffer(); + + /* Adjust the OPAL event */ + if (in_count) + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, + OPAL_EVENT_CONSOLE_INPUT); + else + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0); + + uart_trace(TRACE_UART_CTX_READ, read_cnt, irq_disabled, in_count); + + return read_cnt; +} + +static struct con_ops uart_con_driver = { + .read = uart_con_read, + .write = uart_con_write +}; + +bool uart_console_poll(void) +{ + if (!in_buf) + return false; + + /* Grab what's in the UART and stash it into our buffer */ + uart_read_to_buffer(); + + uart_trace(TRACE_UART_CTX_POLL, 0, irq_disabled, in_count); + + return !!in_count; +} + +void uart_irq(void) +{ + if (!in_buf) + return; + + /* This needs locking vs read() */ + lock(&con_lock); + + /* Grab what's in the UART and stash it into our buffer */ + uart_read_to_buffer(); + + /* Set the event if the buffer has anything in it */ + if (in_count) + opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, + OPAL_EVENT_CONSOLE_INPUT); + + uart_trace(TRACE_UART_CTX_IRQ, 0, irq_disabled, in_count); + unlock(&con_lock); +} + +static bool uart_init_hw(unsigned int speed, unsigned int clock) +{ + unsigned int dll = (clock / 16) / speed; + + /* Clear line control */ + uart_write(REG_LCR, 0x00); + + /* Check if the UART responds */ + uart_write(REG_IER, 0x01); + if (uart_read(REG_IER) != 0x01) + goto detect_fail; + uart_write(REG_IER, 0x00); + if (uart_read(REG_IER) != 0x00) + goto detect_fail; + + uart_write(REG_LCR, LCR_DLAB); + uart_write(REG_DLL, dll & 0xff); + uart_write(REG_DLM, dll >> 8); + uart_write(REG_LCR, 0x03); /* 8N1 */ + uart_write(REG_MCR, 0x03); /* RTS/DTR */ + uart_write(REG_FCR, 0x07); /* clear & en. 
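Illustrative aside, not part of the original sources: the buffering scheme above amounts to simple flow control, with RX interrupts masked while the staging buffer is full and re-enabled once it drains. The sketch below models only that decision, with the LPC register write stubbed out; all names are hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_BUF_SIZE 0x1000

struct demo_uart {
	uint8_t buf[DEMO_BUF_SIZE];
	uint32_t count;
	bool rx_irq_enabled;
};

/* stand-in for uart_write(REG_IER, on ? 0x01 : 0x00) */
static void demo_set_rx_irq(struct demo_uart *u, bool on)
{
	u->rx_irq_enabled = on;
}

/* one received byte: stash it, then gate the RX interrupt on buffer space */
static void demo_rx_byte(struct demo_uart *u, uint8_t b)
{
	if (u->count < DEMO_BUF_SIZE)
		u->buf[u->count++] = b;
	demo_set_rx_irq(u, u->count < DEMO_BUF_SIZE);
}

int main(void)
{
	static struct demo_uart u;
	uint32_t i;

	for (i = 0; i <= DEMO_BUF_SIZE; i++)
		demo_rx_byte(&u, 'x');
	printf("buffered %u bytes, rx irq %s\n", u.count,
	       u.rx_irq_enabled ? "on" : "off");
	return 0;
}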
fifos */ + return true; + + detect_fail: + prerror("UART: Presence detect failed !\n"); + return false; +} + +void uart_init(bool enable_interrupt) +{ + const struct dt_property *prop; + struct dt_node *n; + char *path __unused; + uint32_t irqchip, irq; + + if (!lpc_present()) + return; + + /* We support only one */ + n = dt_find_compatible_node(dt_root, NULL, "ns16550"); + if (!n) + return; + + /* Get IO base */ + prop = dt_find_property(n, "reg"); + if (!prop) { + log_simple_error(&e_info(OPAL_RC_UART_INIT), + "UART: Can't find reg property\n"); + return; + } + if (dt_property_get_cell(prop, 0) != OPAL_LPC_IO) { + log_simple_error(&e_info(OPAL_RC_UART_INIT), + "UART: Only supports IO addresses\n"); + return; + } + uart_base = dt_property_get_cell(prop, 1); + + if (!uart_init_hw(dt_prop_get_u32(n, "current-speed"), + dt_prop_get_u32(n, "clock-frequency"))) { + prerror("UART: Initialization failed\n"); + dt_add_property_strings(n, "status", "bad"); + return; + } + + /* + * Mark LPC used by the console (will mark the relevant + * locks to avoid deadlocks when flushing the console) + */ + lpc_used_by_console(); + + /* Install console backend for printf() */ + set_console(&uart_con_driver); + + /* Setup the interrupts properties since HB couldn't do it */ + irqchip = dt_prop_get_u32(n, "ibm,irq-chip-id"); + irq = get_psi_interrupt(irqchip) + P8_IRQ_PSI_HOST_ERR; + printf("UART: IRQ connected to chip %d, irq# is 0x%x\n", irqchip, irq); + if (enable_interrupt) { + dt_add_property_cells(n, "interrupts", irq); + dt_add_property_cells(n, "interrupt-parent", get_ics_phandle()); + } + + if (dummy_console_enabled()) { + /* + * If the dummy console is enabled, we mark the UART as + * reserved since we don't want the kernel to start using it + * with its own 8250 driver + */ + dt_add_property_strings(n, "status", "reserved"); + + /* + * If the interrupt is enabled, turn on RX interrupts (and + * only these for now + */ + if (enable_interrupt) { + uart_write(REG_IER, 0x01); + has_irq = true; + irq_disabled = false; + } + + /* Allocate an input buffer */ + in_buf = zalloc(IN_BUF_SIZE); + printf("UART: Enabled as OS console\n"); + } else { + /* Else, we expose it as our chosen console */ + dt_add_property_strings(n, "status", "ok"); + path = dt_get_path(n); + dt_add_property_string(dt_chosen, "linux,stdout-path", path); + free(path); + printf("UART: Enabled as OS pass-through\n"); + } +} diff --git a/hw/lpc.c b/hw/lpc.c new file mode 100644 index 00000000..8dc533db --- /dev/null +++ b/hw/lpc.c @@ -0,0 +1,500 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
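Illustrative aside, not part of the original sources: a quick worked example of the divisor-latch computation in uart_init_hw(). The 1.8432 MHz input clock is just the classic 16550 value, used here as an assumption; the firmware takes the real values from the "clock-frequency" and "current-speed" device-tree properties as shown above.

#include <stdio.h>

int main(void)
{
	unsigned int clock = 1843200, speed = 115200;
	unsigned int dll = (clock / 16) / speed;

	/* DLL/DLM are the low and high bytes of the divisor */
	printf("divisor=%u DLL=%u DLM=%u\n", dll, dll & 0xff, dll >> 8);
	return 0;
}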
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_LOG_ENTRY(OPAL_RC_LPC_READ, OPAL_PLATFORM_ERR_EVT, OPAL_LPC, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_LPC_WRITE, OPAL_PLATFORM_ERR_EVT, OPAL_LPC, + OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +#define ECCB_CTL 0 /* b0020 -> b00200 */ +#define ECCB_STAT 2 /* b0022 -> b00210 */ +#define ECCB_DATA 3 /* b0023 -> b00218 */ + +#define ECCB_CTL_MAGIC 0xd000000000000000ul +#define ECCB_CTL_DATASZ_MASK PPC_BITMASK(4,7) +#define ECCB_CTL_DATASZ_LSH PPC_BITLSHIFT(7) +#define ECCB_CTL_READ PPC_BIT(15) +#define ECCB_CTL_ADDRLEN_MASK PPC_BITMASK(23,25) +#define ECCB_CTL_ADDRLEN_LSH PPC_BITLSHIFT(25) +#define ECCB_ADDRLEN_4B 0x4 +#define ECCB_CTL_ADDR_MASK PPC_BITMASK(32,63) +#define ECCB_CTL_ADDR_LSH 0 + +#define ECCB_STAT_PIB_ERR_MASK PPC_BITMASK(0,5) +#define ECCB_STAT_PIB_ERR_LSH PPC_BITLSHIFT(5) +#define ECCB_STAT_RD_DATA_MASK PPC_BITMASK(6,37) +#define ECCB_STAT_RD_DATA_LSH PPC_BITLSHIFT(37) +#define ECCB_STAT_BUSY PPC_BIT(44) +#define ECCB_STAT_ERRORS1_MASK PPC_BITMASK(45,51) +#define ECCB_STAT_ERRORS1_LSH PPC_BITLSHIFT(51) +#define ECCB_STAT_OP_DONE PPC_BIT(52) +#define ECCB_STAT_ERRORS2_MASK PPC_BITMASK(53,55) +#define ECCB_STAT_ERRORS3_LSH PPC_BITLSHIFT(55) + +#define ECCB_STAT_ERR_MASK (ECCB_STAT_PIB_ERR_MASK | \ + ECCB_STAT_ERRORS1_MASK | \ + ECCB_STAT_ERRORS2_MASK) + +#define ECCB_TIMEOUT 1000000 + +/* LPC HC registers */ +#define LPC_HC_FW_SEG_IDSEL 0x24 +#define LPC_HC_FW_RD_ACC_SIZE 0x28 +#define LPC_HC_FW_RD_1B 0x00000000 +#define LPC_HC_FW_RD_2B 0x01000000 +#define LPC_HC_FW_RD_4B 0x02000000 +#define LPC_HC_FW_RD_16B 0x04000000 +#define LPC_HC_FW_RD_128B 0x07000000 + +/* Default LPC bus */ +static int32_t lpc_default_chip_id = -1; + +/* + * These are expected to be the same on all chips and should probably + * be read (or configured) dynamically. This is how things are configured + * today on Tuletta. 
+ */ +static uint32_t lpc_io_opb_base = 0xd0010000; +static uint32_t lpc_mem_opb_base = 0xe0000000; +static uint32_t lpc_fw_opb_base = 0xf0000000; +static uint32_t lpc_reg_opb_base = 0xc0012000; + +static int64_t opb_write(struct proc_chip *chip, uint32_t addr, uint32_t data, + uint32_t sz) +{ + uint64_t ctl = ECCB_CTL_MAGIC, stat; + int64_t rc, tout; + uint64_t data_reg; + + switch(sz) { + case 1: + data_reg = ((uint64_t)data) << 56; + break; + case 2: + data_reg = ((uint64_t)data) << 48; + break; + case 4: + data_reg = ((uint64_t)data) << 32; + break; + default: + prerror("LPC: Invalid data size %d\n", sz); + return OPAL_PARAMETER; + } + + rc = xscom_write(chip->id, chip->lpc_xbase + ECCB_DATA, data_reg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: XSCOM write to ECCB DATA error %lld\n", rc); + return rc; + } + + ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz); + ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B); + ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr); + rc = xscom_write(chip->id, chip->lpc_xbase + ECCB_CTL, ctl); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: XSCOM write to ECCB CTL error %lld\n", rc); + return rc; + } + + for (tout = 0; tout < ECCB_TIMEOUT; tout++) { + rc = xscom_read(chip->id, chip->lpc_xbase + ECCB_STAT, &stat); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: XSCOM read from ECCB STAT err %lld\n", + rc); + return rc; + } + if (stat & ECCB_STAT_OP_DONE) { + if (stat & ECCB_STAT_ERR_MASK) { + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), + "LPC: Error status: 0x%llx\n", stat); + return OPAL_HARDWARE; + } + return OPAL_SUCCESS; + } + time_wait(100); + } + log_simple_error(&e_info(OPAL_RC_LPC_WRITE), "LPC: Write timeout !\n"); + return OPAL_HARDWARE; +} + +static int64_t opb_read(struct proc_chip *chip, uint32_t addr, uint32_t *data, + uint32_t sz) +{ + uint64_t ctl = ECCB_CTL_MAGIC | ECCB_CTL_READ, stat; + int64_t rc, tout; + + if (sz != 1 && sz != 2 && sz != 4) { + prerror("LPC: Invalid data size %d\n", sz); + return OPAL_PARAMETER; + } + + ctl = SETFIELD(ECCB_CTL_DATASZ, ctl, sz); + ctl = SETFIELD(ECCB_CTL_ADDRLEN, ctl, ECCB_ADDRLEN_4B); + ctl = SETFIELD(ECCB_CTL_ADDR, ctl, addr); + rc = xscom_write(chip->id, chip->lpc_xbase + ECCB_CTL, ctl); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_READ), + "LPC: XSCOM write to ECCB CTL error %lld\n", rc); + return rc; + } + + for (tout = 0; tout < ECCB_TIMEOUT; tout++) { + rc = xscom_read(chip->id, chip->lpc_xbase + ECCB_STAT, &stat); + if (rc) { + log_simple_error(&e_info(OPAL_RC_LPC_READ), + "LPC: XSCOM read from ECCB STAT err %lld\n", + rc); + return rc; + } + if (stat & ECCB_STAT_OP_DONE) { + uint32_t rdata = GETFIELD(ECCB_STAT_RD_DATA, stat); + if (stat & ECCB_STAT_ERR_MASK) { + log_simple_error(&e_info(OPAL_RC_LPC_READ), + "LPC: Error status: 0x%llx\n", stat); + return OPAL_HARDWARE; + } + switch(sz) { + case 1: + *data = rdata >> 24; + break; + case 2: + *data = rdata >> 16; + break; + default: + *data = rdata; + break; + } + return 0; + } + time_wait(100); + } + log_simple_error(&e_info(OPAL_RC_LPC_READ), "LPC: Read timeout !\n"); + return OPAL_HARDWARE; +} + +static int64_t lpc_set_fw_idsel(struct proc_chip *chip, uint8_t idsel) +{ + uint32_t val; + int64_t rc; + + if (idsel == chip->lpc_fw_idsel) + return OPAL_SUCCESS; + if (idsel > 0xf) + return OPAL_PARAMETER; + + rc = opb_read(chip, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL, + &val, 4); + if (rc) { + prerror("LPC: Failed to read HC_FW_SEG_IDSEL register !\n"); + return rc; + } + val = (val & 
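Illustrative aside, not part of the original sources: the ECCB data justification used by opb_write()/opb_read() above is isolated below in a standalone sketch. Outgoing data is left-justified in the 64-bit ECCB data register according to the access size, and the 32-bit read data field comes back left-justified as well, so it is shifted down the same way. Function names are hypothetical.

#include <stdint.h>
#include <stdio.h>

static uint64_t demo_wr_justify(uint32_t data, uint32_t sz)
{
	switch (sz) {
	case 1: return (uint64_t)data << 56;
	case 2: return (uint64_t)data << 48;
	case 4: return (uint64_t)data << 32;
	}
	return 0;
}

static uint32_t demo_rd_unjustify(uint32_t rdata, uint32_t sz)
{
	switch (sz) {
	case 1: return rdata >> 24;
	case 2: return rdata >> 16;
	default: return rdata;
	}
}

int main(void)
{
	printf("write 0xAB as 1 byte -> data reg %#llx\n",
	       (unsigned long long)demo_wr_justify(0xAB, 1));
	printf("read field %#x as 1 byte -> %#x\n",
	       0xAB000000, demo_rd_unjustify(0xAB000000, 1));
	return 0;
}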
0xfffffff0) | idsel; + rc = opb_write(chip, lpc_reg_opb_base + LPC_HC_FW_SEG_IDSEL, + val, 4); + if (rc) { + prerror("LPC: Failed to write HC_FW_SEG_IDSEL register !\n"); + return rc; + } + chip->lpc_fw_idsel = idsel; + return OPAL_SUCCESS; +} + +static int64_t lpc_set_fw_rdsz(struct proc_chip *chip, uint8_t rdsz) +{ + uint32_t val; + int64_t rc; + + if (rdsz == chip->lpc_fw_rdsz) + return OPAL_SUCCESS; + switch(rdsz) { + case 1: + val = LPC_HC_FW_RD_1B; + break; + case 2: + val = LPC_HC_FW_RD_2B; + break; + case 4: + val = LPC_HC_FW_RD_4B; + break; + default: + /* + * The HW supports 16 and 128 via a buffer/cache + * but I have never exprimented with it and am not + * sure it works the way we expect so let's leave it + * at that for now + */ + return OPAL_PARAMETER; + } + rc = opb_write(chip, lpc_reg_opb_base + LPC_HC_FW_RD_ACC_SIZE, + val, 4); + if (rc) { + prerror("LPC: Failed to write LPC_HC_FW_RD_ACC_SIZE !\n"); + return rc; + } + chip->lpc_fw_rdsz = rdsz; + return OPAL_SUCCESS; +} + +static int64_t lpc_opb_prepare(struct proc_chip *chip, + enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t sz, + uint32_t *opb_base, bool is_write) +{ + uint32_t top = addr + sz; + uint8_t fw_idsel; + int64_t rc; + + /* Address wraparound */ + if (top < addr) + return OPAL_PARAMETER; + + /* + * Bound check access and get the OPB base address for + * the window corresponding to the access type + */ + switch(addr_type) { + case OPAL_LPC_IO: + /* IO space is 64K */ + if (top > 0x10000) + return OPAL_PARAMETER; + /* And only supports byte accesses */ + if (sz != 1) + return OPAL_PARAMETER; + *opb_base = lpc_io_opb_base; + break; + case OPAL_LPC_MEM: + /* MEM space is 256M */ + if (top > 0x10000000) + return OPAL_PARAMETER; + /* And only supports byte accesses */ + if (sz != 1) + return OPAL_PARAMETER; + *opb_base = lpc_mem_opb_base; + break; + case OPAL_LPC_FW: + /* + * FW space is in segments of 256M controlled + * by IDSEL, make sure we don't cross segments + */ + *opb_base = lpc_fw_opb_base; + fw_idsel = (addr >> 28); + if (((top - 1) >> 28) != fw_idsel) + return OPAL_PARAMETER; + + /* Set segment */ + rc = lpc_set_fw_idsel(chip, fw_idsel); + if (rc) + return rc; + /* Set read access size */ + if (!is_write) { + rc = lpc_set_fw_rdsz(chip, sz); + if (rc) + return rc; + } + break; + default: + return OPAL_PARAMETER; + } + return OPAL_SUCCESS; +} + +static int64_t __lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t data, uint32_t sz) +{ + struct proc_chip *chip = get_chip(chip_id); + uint32_t opb_base; + int64_t rc; + + if (!chip || !chip->lpc_xbase) + return OPAL_PARAMETER; + + lock(&chip->lpc_lock); + + /* + * Convert to an OPB access and handle LPC HC configuration + * for FW accesses (IDSEL) + */ + rc = lpc_opb_prepare(chip, addr_type, addr, sz, &opb_base, true); + if (rc) + goto bail; + + /* Perform OPB access */ + rc = opb_write(chip, opb_base + addr, data, sz); + + unlock(&chip->lpc_lock); + + /* XXX Add LPC error handling/recovery */ + bail: + return rc; +} + +int64_t lpc_write(enum OpalLPCAddressType addr_type, uint32_t addr, + uint32_t data, uint32_t sz) +{ + if (lpc_default_chip_id < 0) + return OPAL_PARAMETER; + return __lpc_write(lpc_default_chip_id, addr_type, addr, data, sz); +} + +/* + * The "OPAL" variant add the emulation of 2 and 4 byte accesses using + * byte accesses for IO and MEM space in order to be compatible with + * existing Linux expectations + */ +static int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type, + 
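Illustrative aside, not part of the original sources: the FW-space handling in lpc_opb_prepare() above hinges on two checks, reproduced standalone below: the IDSEL segment is the top nibble of the 32-bit LPC address (FW space is carved into 256MB segments), and a single access must not wrap around or straddle two segments. Names are hypothetical.

#include <stdint.h>
#include <stdio.h>

static int demo_fw_check(uint32_t addr, uint32_t sz, uint8_t *idsel)
{
	uint32_t top = addr + sz;

	if (top < addr)				/* address wraparound */
		return -1;
	*idsel = addr >> 28;			/* 256MB segment number */
	if (((top - 1) >> 28) != *idsel)	/* crosses a segment */
		return -1;
	return 0;
}

int main(void)
{
	uint8_t idsel;

	printf("ok=%d\n", demo_fw_check(0x0ffffffc, 4, &idsel));	/* fits    */
	printf("ok=%d\n", demo_fw_check(0x0ffffffe, 4, &idsel));	/* crosses */
	return 0;
}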
uint32_t addr, uint32_t data, uint32_t sz) +{ + int64_t rc; + + if (addr_type == OPAL_LPC_FW || sz == 1) + return __lpc_write(chip_id, addr_type, addr, data, sz); + while(sz--) { + rc = __lpc_write(chip_id, addr_type, addr, data & 0xff, 1); + if (rc) + return rc; + addr++; + data >>= 8; + } + return OPAL_SUCCESS; +} + +static int64_t __lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t *data, uint32_t sz) +{ + struct proc_chip *chip = get_chip(chip_id); + uint32_t opb_base; + int64_t rc; + + if (!chip || !chip->lpc_xbase) + return OPAL_PARAMETER; + + lock(&chip->lpc_lock); + + /* + * Convert to an OPB access and handle LPC HC configuration + * for FW accesses (IDSEL and read size) + */ + rc = lpc_opb_prepare(chip, addr_type, addr, sz, &opb_base, false); + if (rc) + goto bail; + + /* Perform OPB access */ + rc = opb_read(chip, opb_base + addr, data, sz); + + unlock(&chip->lpc_lock); + + /* XXX Add LPC error handling/recovery */ + bail: + return rc; +} + +int64_t lpc_read(enum OpalLPCAddressType addr_type, uint32_t addr, + uint32_t *data, uint32_t sz) +{ + if (lpc_default_chip_id < 0) + return OPAL_PARAMETER; + return __lpc_read(lpc_default_chip_id, addr_type, addr, data, sz); +} + +/* + * The "OPAL" variant add the emulation of 2 and 4 byte accesses using + * byte accesses for IO and MEM space in order to be compatible with + * existing Linux expectations + */ +static int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type, + uint32_t addr, uint32_t *data, uint32_t sz) +{ + int64_t rc; + + if (addr_type == OPAL_LPC_FW || sz == 1) + return __lpc_read(chip_id, addr_type, addr, data, sz); + *data = 0; + while(sz--) { + uint32_t byte; + + rc = __lpc_read(chip_id, addr_type, addr, &byte, 1); + if (rc) + return rc; + *data = *data | (byte << (8 * sz)); + addr++; + } + return OPAL_SUCCESS; +} + +bool lpc_present(void) +{ + return lpc_default_chip_id >= 0; +} + +void lpc_init(void) +{ + struct dt_node *xn; + bool has_lpc = false; + + dt_for_each_compatible(dt_root, xn, "ibm,power8-lpc") { + uint32_t gcid = dt_get_chip_id(xn); + struct proc_chip *chip; + + chip = get_chip(gcid); + assert(chip); + + chip->lpc_xbase = dt_get_address(xn, 0, NULL); + chip->lpc_fw_idsel = 0xff; + chip->lpc_fw_rdsz = 0xff; + init_lock(&chip->lpc_lock); + + if (lpc_default_chip_id < 0 || + dt_has_node_property(xn, "primary", NULL)) { + lpc_default_chip_id = chip->id; + } + + printf("LPC: Bus on chip %d PCB_Addr=0x%x\n", + chip->id, chip->lpc_xbase); + has_lpc = true; + } + if (lpc_default_chip_id >= 0) + printf("LPC: Default bus on chip %d\n", lpc_default_chip_id); + + if (has_lpc) { + opal_register(OPAL_LPC_WRITE, opal_lpc_write, 5); + opal_register(OPAL_LPC_READ, opal_lpc_read, 5); + } +} + +void lpc_used_by_console(void) +{ + struct proc_chip *chip; + + xscom_used_by_console(); + + for_each_chip(chip) + chip->lpc_lock.in_con_path = true; +} diff --git a/hw/nx.c b/hw/nx.c new file mode 100644 index 00000000..8f427173 --- /dev/null +++ b/hw/nx.c @@ -0,0 +1,127 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
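Illustrative aside, not part of the original sources: the byte-by-byte emulation loops above determine the byte order seen for 2- and 4-byte IO/MEM cycles. The standalone model below replays both loops against a plain array so the resulting layout is visible: the write loop emits the least-significant byte at the lowest address, while the read loop places the byte from the lowest address into the most-significant position of the result.

#include <stdint.h>
#include <stdio.h>

static void demo_emul_write(uint8_t *bus, uint32_t addr, uint32_t data,
			    uint32_t sz)
{
	while (sz--) {
		bus[addr++] = data & 0xff;	/* LSB goes to lowest address */
		data >>= 8;
	}
}

static uint32_t demo_emul_read(const uint8_t *bus, uint32_t addr, uint32_t sz)
{
	uint32_t data = 0;

	while (sz--) {
		/* byte at lowest address lands in the highest remaining byte */
		data |= (uint32_t)bus[addr++] << (8 * sz);
	}
	return data;
}

int main(void)
{
	uint8_t bus[4] = { 0 };

	demo_emul_write(bus, 0, 0x11223344, 4);
	printf("bus: %02x %02x %02x %02x, read back %#x\n",
	       bus[0], bus[1], bus[2], bus[3], demo_emul_read(bus, 0, 4));
	return 0;
}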
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#include +#include +#include +#include +#include + +#define NX_P7_RNG_BAR XSCOM_SAT(0x1, 0x2, 0x0c) +#define NX_P7_RNG_BAR_ADDR_MASK PPC_BITMASK(18, 51) +#define NX_P7_RNG_BAR_ADDR_LSH PPC_BITLSHIFT(51) +#define NX_P7_RNG_BAR_SIZE_MASK PPC_BITMASK(53, 55) +#define NX_P7_RNG_BAR_SIZE_LSH PPC_BITLSHIFT(55) +#define NX_P7_RNG_BAR_ENABLE PPC_BIT(52) + +#define NX_P8_RNG_BAR XSCOM_SAT(0xc, 0x2, 0x0d) +#define NX_P8_RNG_BAR_ADDR_MASK PPC_BITMASK(14, 51) +#define NX_P8_RNG_BAR_ADDR_LSH PPC_BITLSHIFT(51) +#define NX_P8_RNG_BAR_SIZE_MASK PPC_BITMASK(53, 55) +#define NX_P8_RNG_BAR_SIZE_LSH PPC_BITLSHIFT(55) +#define NX_P8_RNG_BAR_ENABLE PPC_BIT(52) + +#define NX_P7_RNG_CFG XSCOM_SAT(0x1, 0x2, 0x12) +#define NX_P7_RNG_CFG_ENABLE PPC_BIT(63) +#define NX_P8_RNG_CFG XSCOM_SAT(0xc, 0x2, 0x12) +#define NX_P8_RNG_CFG_ENABLE PPC_BIT(63) + +static void nx_create_node(struct dt_node *node) +{ + u64 bar, cfg; + u64 xbar, xcfg; + u32 pb_base; + u32 gcid; + u64 rng_addr, rng_len, len; + struct dt_node *rng; + int rc; + + gcid = dt_get_chip_id(node); + pb_base = dt_get_address(node, 0, NULL); + + if (dt_node_is_compatible(node, "ibm,power7-nx")) { + xbar = pb_base + NX_P7_RNG_BAR; + xcfg = pb_base + NX_P7_RNG_CFG; + } else if (dt_node_is_compatible(node, "ibm,power8-nx")) { + xbar = pb_base + NX_P8_RNG_BAR; + xcfg = pb_base + NX_P8_RNG_CFG; + } else { + prerror("NX%d: Unknown NX type!\n", gcid); + return; + } + + rc = xscom_read(gcid, xbar, &bar); /* Get RNG BAR */ + if (rc) + return; /* Hope xscom always prints error message */ + + rc = xscom_read(gcid, xcfg, &cfg); /* Get RNG CFG */ + if (rc) + return; + + /* + * We use the P8 BAR constants. The layout of the BAR is the + * same, with more bits at the top of P8 which are hard wired to + * 0 on P7. We also mask in-place rather than using GETFIELD + * for the base address as we happen to *know* that it's properly + * aligned in the register. + * + * FIXME? Always assusme BAR gets a valid address from FSP + */ + rng_addr = bar & NX_P8_RNG_BAR_ADDR_MASK; + len = GETFIELD(NX_P8_RNG_BAR_SIZE, bar); + if (len > 4) { + prerror("NX%d: Corrupted bar size %lld\n", gcid, len); + return; + } + rng_len = (u64[]){ 0x1000, /* 4K */ + 0x10000, /* 64K */ + 0x400000000, /* 16G*/ + 0x100000, /* 1M */ + 0x1000000 /* 16M */} [len]; + + + printf("NX%d: RNG BAR set to 0x%016llx..0x%016llx\n", + gcid, rng_addr, rng_addr + rng_len - 1); + + /* RNG must be enabled before MMIO is enabled */ + rc = xscom_write(gcid, xcfg, cfg | NX_P8_RNG_CFG_ENABLE); + if (rc) + return; + + /* The BAR needs to be enabled too */ + rc = xscom_write(gcid, xbar, bar | NX_P8_RNG_BAR_ENABLE); + if (rc) + return; + rng = dt_new_addr(dt_root, "hwrng", rng_addr); + if (!rng) + return; + + dt_add_property_strings(rng, "compatible", "ibm,power-rng"); + dt_add_property_cells(rng, "reg", hi32(rng_addr), lo32(rng_addr), + hi32(rng_len), lo32(rng_len)); + dt_add_property_cells(rng, "ibm,chip-id", gcid); +} + +/* Create nodes for MMIO accesible components in NX (only RNG) */ +void nx_init(void) +{ + struct dt_node *node; + + dt_for_each_compatible(dt_root, node, "ibm,power-nx") + nx_create_node(node); +} diff --git a/hw/occ.c b/hw/occ.c new file mode 100644 index 00000000..7e6284a1 --- /dev/null +++ b/hw/occ.c @@ -0,0 +1,477 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
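Illustrative aside, not part of the original sources: nx_create_node() above decodes the RNG window length from a 3-bit size code through a small lookup table whose entries are not in ascending order. The table is repeated standalone below so the mapping can be checked at a glance.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	static const uint64_t rng_sizes[] = {
		0x1000ull,		/* code 0: 4K  */
		0x10000ull,		/* code 1: 64K */
		0x400000000ull,		/* code 2: 16G */
		0x100000ull,		/* code 3: 1M  */
		0x1000000ull,		/* code 4: 16M */
	};
	unsigned int code;

	for (code = 0; code < 5; code++)
		printf("size code %u -> %#llx\n", code,
		       (unsigned long long)rng_sizes[code]);
	return 0;
}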
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* OCC Communication Area for PStates */ + +#define P8_HOMER_SAPPHIRE_DATA_OFFSET 0x1F8000 + +#define MAX_PSTATES 256 + +struct occ_pstate_entry { + s8 id; + u8 flags; + u8 vdd; + u8 vcs; + u32 freq_khz; +}; + +struct occ_pstate_table { + u8 valid; + u8 version; + u8 throttle; + s8 pstate_min; + s8 pstate_nom; + s8 pstate_max; + u8 spare1; + u8 spare2; + u64 reserved; + struct occ_pstate_entry pstates[MAX_PSTATES]; +}; + +DEFINE_LOG_ENTRY(OPAL_RC_OCC_LOAD, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_OCC_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_OCC_PSTATE_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, + OPAL_CEC_HARDWARE, OPAL_INFO, + OPAL_NA, NULL); + +/* Check each chip's HOMER/Sapphire area for PState valid bit */ +static bool wait_for_all_occ_init(void) +{ + struct proc_chip *chip; + uint64_t occ_data_area; + struct occ_pstate_table *occ_data; + int tries; + uint64_t start_time, end_time; + + start_time = mftb(); + for_each_chip(chip) { + /* Check for valid homer address */ + if (!chip->homer_base) { + printf("OCC: Chip: %x homer_base is not valid\n", + chip->id); + return false; + } + /* Get PState table address */ + occ_data_area = chip->homer_base + P8_HOMER_SAPPHIRE_DATA_OFFSET; + occ_data = (struct occ_pstate_table *)occ_data_area; + + /* + * Checking for occ_data->valid == 1 is ok because we clear all + * homer_base+size before passing memory to host services. 
+ * This ensures occ_data->valid == 0 before OCC load + */ + tries = 20; /* 2 secs */ + while((occ_data->valid != 1) && tries--) { + time_wait_ms(100); + } + if (occ_data->valid != 1) { + printf("OCC: Chip: %x PState table is not valid\n", + chip->id); + return false; + } + printf("OCC: Chip %02x Data (%016llx) = %016llx\n", + chip->id, occ_data_area, + *(uint64_t *)occ_data_area); + } + end_time = mftb(); + printf("OCC: All Chip Rdy after %lld ms\n", (end_time - start_time) / 512 / 1000); + return true; +} + +/* Add device tree properties to describe pstates states */ +/* Retrun nominal pstate to set in each core */ +static bool add_cpu_pstate_properties(s8 *pstate_nom) +{ + struct proc_chip *chip; + uint64_t occ_data_area; + struct occ_pstate_table *occ_data; + struct dt_node *power_mgt; + u8 nr_pstates; + /* Arrays for device tree */ + u32 dt_id[MAX_PSTATES]; + u32 dt_freq[MAX_PSTATES]; + int i; + + printf("OCC: CPU pstate state device tree init\n"); + + /* Find first chip and core */ + chip = next_chip(NULL); + + /* Extract PState information from OCC */ + + /* Dump state table */ + occ_data_area = chip->homer_base + P8_HOMER_SAPPHIRE_DATA_OFFSET; + + printf("OCC: Data (%16llx) = %16llx %16llx\n", + occ_data_area, + *(uint64_t *)occ_data_area, + *(uint64_t *)(occ_data_area+8)); + + occ_data = (struct occ_pstate_table *)occ_data_area; + + if (!occ_data->valid) { + printf("OCC: PState table is not valid\n"); + return false; + } + + nr_pstates = occ_data->pstate_max - occ_data->pstate_min + 1; + printf("OCC: Min %d Nom %d Max %d Nr States %d\n", + occ_data->pstate_min, occ_data->pstate_nom, + occ_data->pstate_max, nr_pstates); + + if (nr_pstates <= 1 || nr_pstates > 128) { + printf("OCC: OCC range is not valid\n"); + return false; + } + + /* Setup arrays for device-tree */ + for( i=0; i < nr_pstates; i++) { + dt_id[i] = occ_data->pstates[i].id; + dt_freq[i] = occ_data->pstates[i].freq_khz/1000; + } + + power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt"); + if (!power_mgt) { + printf("OCC: dt node /ibm,opal/power-mgt not found\n"); + return false; + } + + /* Add the device-tree entries */ + dt_add_property(power_mgt, "ibm,pstate-ids", dt_id, nr_pstates * 4); + dt_add_property(power_mgt, "ibm,pstate-frequencies-mhz", dt_freq, nr_pstates * 4); + dt_add_property_cells(power_mgt, "ibm,pstate-min", occ_data->pstate_min); + dt_add_property_cells(power_mgt, "ibm,pstate-nominal", occ_data->pstate_nom); + dt_add_property_cells(power_mgt, "ibm,pstate-max", occ_data->pstate_max); + + /* Return pstate to set for each core */ + *pstate_nom = occ_data->pstate_nom; + return true; +} + +/* + * Prepare chip for pstate transitions + */ + +static bool cpu_pstates_prepare_core(struct proc_chip *chip, struct cpu_thread *c, s8 pstate_nom) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp, pstate; + int rc; + + /* + * Currently Fastsleep init clears EX_PM_SPR_OVERRIDE_EN. 
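Illustrative aside, not part of the original sources: a reduced model of how add_cpu_pstate_properties() above flattens the OCC pstate table into the two arrays exported through the device tree, one entry per pstate between pstate_min and pstate_max, ids copied as-is and frequencies converted from kHz to MHz. The table contents in main() are invented for illustration.

#include <stdint.h>
#include <stdio.h>

struct demo_pstate {
	int8_t id;
	uint32_t freq_khz;
};

int main(void)
{
	struct demo_pstate tbl[] = {
		{ -2, 3092000 }, { -1, 3425000 }, { 0, 3724000 },
	};
	int8_t pstate_min = -2, pstate_max = 0;
	int nr = pstate_max - pstate_min + 1;
	uint32_t dt_id[8], dt_freq[8];
	int i;

	for (i = 0; i < nr; i++) {
		dt_id[i] = tbl[i].id;			/* signed id stored in a u32 cell */
		dt_freq[i] = tbl[i].freq_khz / 1000;	/* kHz -> MHz */
		printf("pstate %d -> %u MHz\n", (int8_t)dt_id[i], dt_freq[i]);
	}
	return 0;
}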
+ * Need to ensure only relevant bits are inited + */ + + /* Init PM GP1 for SCOM based PSTATE control to set nominal freq */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), &tmp); + tmp = tmp | EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN; + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to write PM_GP1 in pstates init\n"); + return false; + } + + /* Set new pstate to core */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), &tmp); + tmp = tmp & ~0xFFFF000000000000ULL; + pstate = ((uint64_t) pstate_nom) & 0xFF; + tmp = tmp | (pstate << 56) | (pstate << 48); + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to write PM_GP1 in pstates init\n"); + return false; + } + time_wait_ms(1); /* Wait for PState to change */ + /* + * Init PM GP1 for SPR based PSTATE control. + * Once OCC is active EX_PM_SETUP_GP1_DPLL_FREQ_OVERRIDE_EN will be + * cleared by OCC. Sapphire need not clear. + * However wait for DVFS state machine to become idle after min->nominal + * transition initiated above. If not switch over to SPR control could fail. + */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), &tmp); + tmp = tmp & ~EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN; + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: Failed to write PM_GP1 in pstates init\n"); + return false; + } + + /* Just debug */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMSR), &tmp); + printf("OCC: Chip %x Core %x PPMSR %016llx\n", chip->id, core, tmp); + + /* + * If PMSR is still in transition at this point due to PState change + * initiated above, then the switchover to SPR may not work. + * ToDo: Check for DVFS state machine idle before change. + */ + + return true; +} + +/* CPU-OCC PState init */ +/* Called after OCC init on P8 */ +void occ_pstates_init(void) +{ + struct proc_chip *chip; + struct cpu_thread *c; + s8 pstate_nom; + + /* OCC is P8 only */ + if (proc_gen != proc_gen_p8) + return; + + chip = next_chip(NULL); + if (!chip->homer_base) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: No HOMER detected, assuming no pstates\n"); + return; + } + + /* Wait for all OCC to boot up */ + if(!wait_for_all_occ_init()) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "OCC: All OCC did not init. 
Timed Out\n"); + return; + } + + /* + * Check boundary conditions and add device tree nodes + * and return nominal pstate to set for the core + */ + if (!add_cpu_pstate_properties(&pstate_nom)) { + log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), + "Skiping core cpufreq init due to OCC error\n"); + return; + } + + /* Setup host based pstates and set nominal frequency */ + for_each_chip(chip) { + for_each_available_core_in_chip(c, chip->id) { + cpu_pstates_prepare_core(chip, c, pstate_nom); + } + } +} + +static void occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id) +{ + struct fsp_msg *rsp, *stat; + int rc = -ENOMEM; + int status_word = 0; + struct proc_chip *chip = next_chip(NULL); + u8 err = 0; + + /* Check arguments */ + if (scope != 0x01 && scope != 0x02) { + prerror("OCC: Load message with invalid scope 0x%x\n", + scope); + err = 0x22; + } + + /* First queue up an OK response to the load message itself */ + rsp = fsp_mkmsg(FSP_RSP_LOAD_OCC, 0 | err); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d queueing FSP OCC LOAD reply\n", rc); + return; + } + + /* If we had an error, return */ + if (err) + return; + + /* Call HBRT... */ + rc = host_services_occ_load(); + + /* Handle fallback to preload */ + if (rc == -ENOENT && chip->homer_base) { + printf("OCC: Load: Fallback to preloaded image\n"); + rc = 0; + } else if (!rc) { + /* Success, start OCC */ + rc = host_services_occ_start(); + } + if (rc) { + /* If either of hostservices call fail, send fail to FSP */ + /* Find a chip ID to send failure */ + for_each_chip(chip) { + if (scope == 0x01 && dbob_id != chip->dbob_id) + continue; + status_word = 0xB500 | (chip->pcid & 0xff); + break; + } + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d in load/start OCC\n", err); + } + + /* Send a single response for all chips */ + stat = fsp_mkmsg(FSP_CMD_LOAD_OCC_STAT, 2, status_word, seq_id); + if (stat) + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_LOAD), + "OCC: Error %d queueing FSP OCC LOAD STATUS msg", rc); + } +} + +static void occ_do_reset(u8 scope, u32 dbob_id, u32 seq_id) +{ + struct fsp_msg *rsp, *stat; + struct proc_chip *chip = next_chip(NULL); + int rc = -ENOMEM; + u8 err = 0; + + /* Check arguments */ + if (scope != 0x01 && scope != 0x02) { + prerror("OCC: Reset message with invalid scope 0x%x\n", + scope); + err = 0x22; + } + + /* First queue up an OK response to the reset message itself */ + rsp = fsp_mkmsg(FSP_RSP_RESET_OCC, 0 | err); + if (rsp) + rc = fsp_queue_msg(rsp, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %d queueing FSP OCC RESET reply\n", rc); + return; + } + + /* If we had an error, return */ + if (err) + return; + + /* Call HBRT... 
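Illustrative aside, not part of the original sources: the PPMCR update in cpu_pstates_prepare_core() above packs the requested signed 8-bit pstate into the top two bytes of the register while leaving the remaining bits alone. A standalone worked example with an invented register value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ppmcr = 0x0123456789abcdefull;
	int8_t pstate_nom = -3;
	uint64_t ps = (uint64_t)pstate_nom & 0xff;

	ppmcr &= ~0xFFFF000000000000ull;	/* clear the two request bytes */
	ppmcr |= (ps << 56) | (ps << 48);	/* write the pstate into both  */
	printf("PPMCR = %#018llx\n", (unsigned long long)ppmcr);
	return 0;
}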
*/ + rc = host_services_occ_start(); + + /* Handle fallback to preload */ + if (rc == -ENOENT && chip->homer_base) { + printf("OCC: Reset: Fallback to preloaded image\n"); + rc = 0; + } + if (!rc) { + /* Send a single success response for all chips */ + stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, 0, seq_id); + if (stat) + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %d queueing FSP OCC RESET" + " STATUS message\n", rc); + } + } else { + + /* + * Then send a matching OCC Reset Status message with an 0xFE + * (fail) response code as well to the first matching chip + */ + for_each_chip(chip) { + if (scope == 0x01 && dbob_id != chip->dbob_id) + continue; + rc = -ENOMEM; + stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, + 0xfe00 | (chip->pcid & 0xff), seq_id); + if (stat) + rc = fsp_queue_msg(stat, fsp_freemsg); + if (rc) { + log_simple_error(&e_info(OPAL_RC_OCC_RESET), + "OCC: Error %d queueing FSP OCC RESET" + " STATUS message\n", rc); + } + break; + } + } +} + +static bool fsp_occ_msg(u32 cmd_sub_mod, struct fsp_msg *msg) +{ + u32 dbob_id, seq_id; + u8 scope; + + switch (cmd_sub_mod) { + case FSP_CMD_LOAD_OCC: + /* + * We get the "Load OCC" command at boot. We don't currently + * support loading it ourselves (we don't have the procedures, + * they will come with Host Services). For now HostBoot will + * have loaded a OCC firmware for us, but we still need to + * be nice and respond to OCC. + */ + scope = msg->data.bytes[3]; + dbob_id = msg->data.words[1]; + seq_id = msg->data.words[2]; + printf("OCC: Got OCC Load message, scope=0x%x dbob=0x%x" + " seq=0x%x\n", scope, dbob_id, seq_id); + occ_do_load(scope, dbob_id, seq_id); + return true; + + case FSP_CMD_RESET_OCC: + /* + * We shouldn't be getting this one, but if we do, we have + * to reply something sensible or the FSP will get upset + */ + scope = msg->data.bytes[3]; + dbob_id = msg->data.words[1]; + seq_id = msg->data.words[2]; + printf("OCC: Got OCC Reset message, scope=0x%x dbob=0x%x" + " seq=0x%x\n", scope, dbob_id, seq_id); + occ_do_reset(scope, dbob_id, seq_id); + return true; + } + return false; +} + +static struct fsp_client fsp_occ_client = { + .message = fsp_occ_msg, +}; + +void occ_fsp_init(void) +{ + /* OCC is P8 only */ + if (proc_gen != proc_gen_p8) + return; + + /* If we have an FSP, register for notifications */ + if (fsp_present()) + fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC); +} + + diff --git a/hw/p5ioc2-phb.c b/hw/p5ioc2-phb.c new file mode 100644 index 00000000..b52fe44b --- /dev/null +++ b/hw/p5ioc2-phb.c @@ -0,0 +1,1233 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void p5ioc2_phb_trace(struct p5ioc2_phb *p, FILE *s, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); + +static void p5ioc2_phb_trace(struct p5ioc2_phb *p, FILE *s, const char *fmt, ...) 
+{ + /* Use a temp stack buffer to print all at once to avoid + * mixups of a trace entry on SMP + */ + char tbuf[128 + 10]; + va_list args; + char *b = tbuf; + + b += sprintf(b, "PHB%d: ", p->phb.opal_id); + va_start(args, fmt); + vsnprintf(b, 128, fmt, args); + va_end(args); + fputs(tbuf, s); +} +#define PHBDBG(p, fmt...) p5ioc2_phb_trace(p, stdout, fmt) +#define PHBERR(p, fmt...) p5ioc2_phb_trace(p, stderr, fmt) + +/* Helper to set the state machine timeout */ +static inline uint64_t p5ioc2_set_sm_timeout(struct p5ioc2_phb *p, uint64_t dur) +{ + uint64_t target, now = mftb(); + + target = now + dur; + if (target == 0) + target++; + p->delay_tgt_tb = target; + + return dur; +} + +/* + * Lock callbacks. Allows the OPAL API handlers to lock the + * PHB around calls such as config space, EEH, etc... + */ +static void p5ioc2_phb_lock(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + + lock(&p->lock); +} + +static void p5ioc2_phb_unlock(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + + unlock(&p->lock); +} + +/* + * Configuration space access + * + * The PHB lock is assumed to be already held + */ +static int64_t p5ioc2_pcicfg_address(struct p5ioc2_phb *p, uint32_t bdfn, + uint32_t offset, uint32_t size) +{ + uint32_t addr, sm = size - 1; + + if (bdfn > 0xffff) + return OPAL_PARAMETER; + /* XXX Should we enable 4K config space on PCI-X 2.0 ? */ + if ((offset > 0xff && !p->is_pcie) || offset > 0xfff) + return OPAL_PARAMETER; + if (offset & sm) + return OPAL_PARAMETER; + + /* The root bus only has a device at 0 and we get into an + * error state if we try to probe beyond that, so let's + * avoid that and just return an error to Linux + */ + if (p->is_pcie && (bdfn >> 8) == 0 && (bdfn & 0xff)) + return OPAL_HARDWARE; + + /* Prevent special operation generation */ + if (((bdfn >> 3) & 0x1f) == 0x1f) + return OPAL_HARDWARE; + + /* Check PHB state */ + if (p->state == P5IOC2_PHB_STATE_BROKEN) + return OPAL_HARDWARE; + + /* Additionally, should we prevent writes to the PHB own + * bus number register ? 
+ */ + + addr = CAP_PCADR_ENABLE | ((uint64_t)bdfn << CAP_PCADR_FUNC_LSH); + addr |= (offset & 0xff); + addr |= ((offset & 0xf00) << (CAP_PCADR_EXTOFF_LSH - 8)); + out_le32(p->regs + CAP_PCADR, addr); + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_pcicfg_read8(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint8_t *data) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + int64_t rc; + + /* Initialize data in case of error */ + *data = 0xff; + + rc = p5ioc2_pcicfg_address(p, bdfn, offset, 1); + if (rc) + return rc; + + *data = in_8(p->regs + CAP_PCDAT + (offset & 3)); + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_pcicfg_read16(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint16_t *data) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + int64_t rc; + + /* Initialize data in case of error */ + *data = 0xffff; + + rc = p5ioc2_pcicfg_address(p, bdfn, offset, 2); + if (rc) + return rc; + + *data = in_le16(p->regs + CAP_PCDAT + (offset & 3)); + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_pcicfg_read32(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint32_t *data) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + int64_t rc; + + /* Initialize data in case of error */ + *data = 0xffffffff; + + rc = p5ioc2_pcicfg_address(p, bdfn, offset, 4); + if (rc) + return rc; + + *data = in_le32(p->regs + CAP_PCDAT); + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_pcicfg_write8(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint8_t data) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + int64_t rc; + + rc = p5ioc2_pcicfg_address(p, bdfn, offset, 1); + if (rc) + return rc; + + out_8(p->regs + CAP_PCDAT + (offset & 3), data); + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_pcicfg_write16(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint16_t data) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + int64_t rc; + + rc = p5ioc2_pcicfg_address(p, bdfn, offset, 2); + if (rc) + return rc; + + out_le16(p->regs + CAP_PCDAT + (offset & 3), data); + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_pcicfg_write32(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint32_t data) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + int64_t rc; + + rc = p5ioc2_pcicfg_address(p, bdfn, offset, 4); + if (rc) + return rc; + + out_le32(p->regs + CAP_PCDAT, data); + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_presence_detect(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint16_t slotstat; + int64_t rc; + + if (!p->is_pcie) { + uint32_t lsr; + + lsr = in_be32(p->regs + SHPC_LOGICAL_SLOT); + if (GETFIELD(SHPC_LOGICAL_SLOT_PRSNT, lsr) + != SHPC_SLOT_STATE_EMPTY) + return OPAL_SHPC_DEV_PRESENT; + else + return OPAL_SHPC_DEV_NOT_PRESENT; + } + + rc = p5ioc2_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTSTAT, + &slotstat); + if (rc || !(slotstat & PCICAP_EXP_SLOTSTAT_PDETECTST)) + return OPAL_SHPC_DEV_NOT_PRESENT; + return OPAL_SHPC_DEV_PRESENT; +} + +static int64_t p5ioc2_link_state(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint16_t lstat; + int64_t rc; + + /* XXX Test for PHB in error state ? 
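Illustrative aside, not part of the original sources: p5ioc2_pcicfg_address() above rejects a request before touching hardware based on a few argument checks; they are pulled out below as a standalone predicate so the PCI-X versus PCIe offset limits and the natural-alignment rule are explicit. Names are hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool demo_cfg_args_ok(uint32_t bdfn, uint32_t offset, uint32_t size,
			     bool is_pcie)
{
	if (bdfn > 0xffff)
		return false;
	if ((offset > 0xff && !is_pcie) || offset > 0xfff)
		return false;		/* 256B config space on PCI-X, 4K on PCIe */
	if (offset & (size - 1))
		return false;		/* must be naturally aligned */
	return true;
}

int main(void)
{
	printf("%d\n", demo_cfg_args_ok(0x0800, 0x100, 4, true));	/* 1 */
	printf("%d\n", demo_cfg_args_ok(0x0800, 0x100, 4, false));	/* 0 */
	printf("%d\n", demo_cfg_args_ok(0x0800, 0x102, 4, true));	/* 0 */
	return 0;
}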
*/ + if (!p->is_pcie) + return OPAL_SHPC_LINK_UP_x1; + + rc = p5ioc2_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_LSTAT, + &lstat); + if (rc < 0) { + /* Shouldn't happen */ + PHBERR(p, "Failed to read link status\n"); + return OPAL_HARDWARE; + } + if (!(lstat & PCICAP_EXP_LSTAT_DLLL_ACT)) + return OPAL_SHPC_LINK_DOWN; + return GETFIELD(PCICAP_EXP_LSTAT_WIDTH, lstat); +} + +static int64_t p5ioc2_power_state(struct phb *phb __unused) +{ + /* XXX FIXME */ +#if 0 + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint64_t reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + + /* XXX Test for PHB in error state ? */ + + if (reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT) + return OPAL_SHPC_POWER_ON; + + return OPAL_SHPC_POWER_OFF; +#else + return OPAL_SHPC_POWER_ON; +#endif +} + +/* p5ioc2_sm_slot_power_off - Slot power off state machine + */ +static int64_t p5ioc2_sm_slot_power_off(struct p5ioc2_phb *p) +{ + switch(p->state) { + default: + break; + } + + /* Unknown state, hardware error ? */ + return OPAL_HARDWARE; +} + +static int64_t p5ioc2_slot_power_off(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + + if (p->state != P5IOC2_PHB_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* run state machine */ + return p5ioc2_sm_slot_power_off(p); +} + +static int64_t p5ioc2_sm_slot_power_on(struct p5ioc2_phb *p __unused) +{ +#if 0 + uint64_t reg; + uint32_t reg32; + uint16_t brctl; + + switch(p->state) { + case P5IOC2_PHB_STATE_FUNCTIONAL: + /* Check presence */ + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + if (!(reg & PHB_PCIE_SLOTCTL2_PRSTN_STAT)) { + PHBDBG(p, "Slot power on: no device\n"); + return OPAL_CLOSED; + } + + /* Adjust UTL interrupt settings to disable various + * errors that would interfere with the process + */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0x7e00000000000000); + + /* If the power is not on, turn it on now */ + if (!(reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT)) { + reg = in_be64(p->regs + PHB_HOTPLUG_OVERRIDE); + reg &= ~(0x8c00000000000000ul); + reg |= 0x8400000000000000ul; + out_be64(p->regs + PHB_HOTPLUG_OVERRIDE, reg); + p->state = PHB_STATE_SPUP_STABILIZE_DELAY; + PHBDBG(p, "Slot power on: powering on...\n"); + return p5ioc2_set_sm_timeout(p, secs_to_tb(2)); + } + /* Power is already on */ + power_ok: + /* Ensure hot reset is deasserted */ + p5ioc2_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET; + p5ioc2_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + p->retries = 40; + p->state = PHB_STATE_SPUP_WAIT_LINK; + PHBDBG(p, "Slot power on: waiting for link\n"); + /* Fall through */ + case PHB_STATE_SPUP_WAIT_LINK: + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + /* Link is up ? Complete */ + + /* XXX TODO: Check link width problem and if present + * go straight to the host reset code path. 
+ */ + if (reg & PHB_PCIE_DLP_TC_DL_LINKACT) { + /* Restore UTL interrupts */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, + 0xfe65000000000000); + p->state = PHB_STATE_FUNCTIONAL; + PHBDBG(p, "Slot power on: up !\n"); + return OPAL_SUCCESS; + } + /* Retries */ + p->retries--; + if (p->retries == 0) { + /* XXX Improve logging */ + PHBERR(p,"Slot power on: Timeout waiting for link\n"); + goto error; + } + /* Check time elapsed */ + if ((p->retries % 20) != 0) + return p5ioc2_set_sm_timeout(p, msecs_to_tb(10)); + + /* >200ms, time to try a hot reset after clearing the + * link status bit (doco says to do so) + */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0x0080000000000000); + + /* Mask receiver error status in AER */ + p5ioc2_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, ®32); + reg32 |= PCIECAP_AER_CE_RECVR_ERR; + p5ioc2_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, reg32); + + /* Turn on host reset */ + p5ioc2_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl |= PCI_CFG_BRCTL_SECONDARY_RESET; + p5ioc2_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + p->state = PHB_STATE_SPUP_HOT_RESET_DELAY; + PHBDBG(p, "Slot power on: soft reset...\n"); + return p5ioc2_set_sm_timeout(p, secs_to_tb(1)); + case PHB_STATE_SPUP_HOT_RESET_DELAY: + /* Turn off host reset */ + p5ioc2_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET; + p5ioc2_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + /* Clear spurious errors */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0x00e0000000000000); + p5ioc2_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_STATUS, + PCIECAP_AER_CE_RECVR_ERR); + /* Unmask receiver error status in AER */ + p5ioc2_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, ®32); + reg32 &= ~PCIECAP_AER_CE_RECVR_ERR; + p5ioc2_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, reg32); + /* Go back to waiting for link */ + p->state = PHB_STATE_SPUP_WAIT_LINK; + PHBDBG(p, "Slot power on: waiting for link (2)\n"); + return p5ioc2_set_sm_timeout(p, msecs_to_tb(10)); + + case PHB_STATE_SPUP_STABILIZE_DELAY: + /* Come here after the 2s delay after power up */ + p->retries = 1000; + p->state = PHB_STATE_SPUP_SLOT_STATUS; + PHBDBG(p, "Slot power on: waiting for power\n"); + /* Fall through */ + case PHB_STATE_SPUP_SLOT_STATUS: + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + + /* Doc says to check LED status, but we ignore that, there + * no point really and it's easier that way + */ + if (reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT) + goto power_ok; + if (p->retries-- == 0) { + /* XXX Improve error logging */ + PHBERR(p, "Timeout powering up slot\n"); + goto error; + } + return p5ioc2_set_sm_timeout(p, msecs_to_tb(10)); + default: + break; + } + + /* Unknown state, hardware error ? */ + error: + p->state = PHB_STATE_FUNCTIONAL; + return OPAL_HARDWARE; +#else + return OPAL_SUCCESS; +#endif +} + +static int64_t p5ioc2_slot_power_on(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + + if (p->state != P5IOC2_PHB_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* run state machine */ + return p5ioc2_sm_slot_power_on(p); +} + +static int64_t p5ioc2_sm_hot_reset(struct p5ioc2_phb *p) +{ + switch(p->state) { + default: + break; + } + + /* Unknown state, hardware error ? 
*/ + return OPAL_HARDWARE; +} + +static int64_t p5ioc2_hot_reset(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + + if (p->state != P5IOC2_PHB_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* run state machine */ + return p5ioc2_sm_hot_reset(p); +} + +static int64_t p5ioc2_sm_freset(struct p5ioc2_phb *p) +{ + switch(p->state) { + default: + break; + } + + /* XXX Not implemented, return success to make + * pci.c happy, otherwise probing of slots will + * fail + */ + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_freset(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + + if (p->state != P5IOC2_PHB_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* run state machine */ + return p5ioc2_sm_freset(p); +} + +static int64_t p5ioc2_poll(struct phb *phb) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint64_t now = mftb(); + + if (p->state == P5IOC2_PHB_STATE_FUNCTIONAL) + return OPAL_SUCCESS; + + /* Check timer */ + if (p->delay_tgt_tb && + tb_compare(now, p->delay_tgt_tb) == TB_ABEFOREB) + return p->delay_tgt_tb - now; + + /* Expired (or not armed), clear it */ + p->delay_tgt_tb = 0; + +#if 0 + /* Dispatch to the right state machine */ + switch(p->state) { + case PHB_STATE_SPUP_STABILIZE_DELAY: + case PHB_STATE_SPUP_SLOT_STATUS: + case PHB_STATE_SPUP_WAIT_LINK: + case PHB_STATE_SPUP_HOT_RESET_DELAY: + return p5ioc2_sm_slot_power_on(p); + case PHB_STATE_SPDOWN_STABILIZE_DELAY: + case PHB_STATE_SPDOWN_SLOT_STATUS: + return p5ioc2_sm_slot_power_off(p); + case PHB_STATE_HRESET_DELAY: + return p5ioc2_sm_hot_reset(p); + default: + break; + } +#endif + /* Unknown state, could be a HW error */ + return OPAL_HARDWARE; +} + +static int64_t p5ioc2_eeh_freeze_status(struct phb *phb, uint64_t pe_number, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity, + uint64_t *phb_status __unused) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint32_t cfgrw; + + /* Defaults: not frozen */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_NO_ERROR; + + if (pe_number != 0) + return OPAL_PARAMETER; + + /* XXX Handle PHB status */ + /* XXX We currently only check for PE freeze, not fence */ + + cfgrw = in_be32(p->regs + CAP_PCFGRW); + if (cfgrw & CAP_PCFGRW_MMIO_FROZEN) + *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE; + if (cfgrw & CAP_PCFGRW_DMA_FROZEN) + *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE; + + if (severity && + (cfgrw & (CAP_PCFGRW_MMIO_FROZEN | CAP_PCFGRW_MMIO_FROZEN))) + *severity = OPAL_EEH_SEV_PE_ER; + + /* XXX Don't bother populating pci_error_type */ + /* Should read the bits from PLSSR */ + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_eeh_next_error(struct phb *phb, uint64_t *first_frozen_pe, + uint16_t *pci_error_type, uint16_t *severity) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint32_t cfgrw; + + /* XXX Don't bother */ + *pci_error_type = OPAL_EEH_NO_ERROR; + *first_frozen_pe = 0; + + cfgrw = in_be32(p->regs + CAP_PCFGRW); + if (cfgrw & (CAP_PCFGRW_MMIO_FROZEN | CAP_PCFGRW_MMIO_FROZEN)) + *severity = OPAL_EEH_SEV_PE_ER; + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_eeh_freeze_clear(struct phb *phb, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint32_t cfgrw; + + if (pe_number != 0) + return OPAL_PARAMETER; + + /* + * This sequence isn't very well documented. We play guess + * games based on the documentation, what we do on P7IOC, + * and common sense. 
+ * + * Basically we start from the low level (UTL), clear all + * error conditions there. Then we clear error conditions + * in the PLSSR and DMACSR. + * + * Once that's done, we unfreeze the PHB + * + * Note: Should we also clear the error bits in the config + * space ? The docs don't say anything... TODO: Check what + * OPAL does if possible or ask Milton. + */ + + /* Clear UTL error regs on PCIe */ + if (p->is_pcie) { + uint32_t err; + + err = in_be32(p->regs + UTL_SYS_BUS_AGENT_STATUS); + out_be32(p->regs + UTL_SYS_BUS_AGENT_STATUS, err); + err = in_be32(p->regs + UTL_PCIE_PORT_STATUS); + out_be32(p->regs + UTL_PCIE_PORT_STATUS, err); + err = in_be32(p->regs + UTL_RC_STATUS); + out_be32(p->regs + UTL_RC_STATUS, err); + } + + /* XXX We should probably clear the error regs in the cfg space... */ + + /* Clear PLSSR and DMACSR */ + out_be32(p->regs + CAP_DMACSR, 0); + out_be32(p->regs + CAP_PLSSR, 0); + + /* Clear freeze state as requested */ + cfgrw = in_be32(p->regs + CAP_PCFGRW); + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) { + cfgrw &= ~CAP_PCFGRW_MMIO_FROZEN; + out_be32(p->regs + CAP_PCFGRW, cfgrw); + } + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) { + cfgrw &= ~CAP_PCFGRW_DMA_FROZEN; + out_be32(p->regs + CAP_PCFGRW, cfgrw); + } + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_get_msi_64(struct phb *phb __unused, uint32_t mve_number, + uint32_t xive_num, uint8_t msi_range, + uint64_t *msi_address, uint32_t *message_data) +{ + if (mve_number > 255 || xive_num > 255 || msi_range != 1) + return OPAL_PARAMETER; + + *msi_address = 0x1000000000000000ul; + *message_data = xive_num; + + return OPAL_SUCCESS; +} + +static uint8_t p5ioc2_choose_bus(struct phb *phb __unused, + struct pci_device *bridge __unused, + uint8_t candidate, uint8_t *max_bus __unused, + bool *use_max) +{ + /* Use standard bus number selection */ + *use_max = false; + return candidate; +} + +/* p5ioc2_phb_ioda_reset - Reset the IODA tables + * + * This reset the IODA tables in the PHB. 
It is called at + * initialization time, on PHB reset, and can be called + * explicitly from OPAL + * + * Note: We don't handle EEH on p5ioc2, we use no cache + * and thus always purge + */ +static int64_t p5ioc2_ioda_reset(struct phb *phb, bool purge __unused) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + unsigned int i; + + /* Init XIVRs */ + for (i = 0; i < 16; i++) { + p->xive_cache[i] = SETFIELD(CAP_XIVR_PRIO, 0, 0xff); + out_be32(p->regs + CAP_XIVRn(i), 0x000000ff); + } + + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_set_phb_tce_memory(struct phb *phb, + uint64_t tce_mem_addr, + uint64_t tce_mem_size) +{ + struct p5ioc2_phb *p = phb_to_p5ioc2_phb(phb); + uint64_t tar; + uint32_t cfg; + + printf("PHB%d: set_tce_memory: 0x%016llx 0x%016llx\n", + p->index, tce_mem_addr, tce_mem_size); + printf("PHB%d: bridge values : 0x%016llx 0x%016llx\n", + p->index, p->ioc->tce_base, p->ioc->tce_size); + + /* First check if it fits in the memory established for + * the IO HUB + */ + if (tce_mem_addr && + (tce_mem_addr < p->ioc->tce_base || + tce_mem_addr > (p->ioc->tce_base + p->ioc->tce_size) || + (tce_mem_addr + tce_mem_size) > + (p->ioc->tce_base + p->ioc->tce_size))) { + prerror("PHB%d: TCEs not in bridge range\n", p->index); + return OPAL_PARAMETER; + } + + /* Supported sizes are power of two's naturally aligned + * and between 64K and 8M (p5ioc2 spec) + */ + if (tce_mem_addr && !is_pow2(tce_mem_size)) { + prerror("PHB%d: Size is not a power of 2\n", p->index); + return OPAL_PARAMETER; + } + if (tce_mem_addr & (tce_mem_size - 1)) { + prerror("PHB%d: Not naturally aligned\n", p->index); + return OPAL_PARAMETER; + } + if (tce_mem_addr && + (tce_mem_size < 0x10000 || tce_mem_size > 0x800000)) { + prerror("PHB%d: Size out of range\n", p->index); + return OPAL_PARAMETER; + } + + /* First we disable TCEs in the bridge */ + cfg = in_be32(p->regs + CAP_PCFGRW); + cfg &= ~CAP_PCFGRW_TCE_EN; + out_be32(p->regs + CAP_PCFGRW, cfg); + + + /* Now there's a blurb in the spec about all TARm needing + * to have the same size.. I will let that as a surprise + * for the user ... Linux does it fine and I'd rather not + * keep more state to check than I need to + */ + tar = 0; + if (tce_mem_addr) { + tar = SETFIELD(CA_TAR_HUBID, 0ul, p->ca ? 4 : 1); + tar = SETFIELD(CA_TAR_ALTHUBID, tar, p->ca ? 
4 : 1); + tar = SETFIELD(CA_TAR_NUM_TCE, tar, ilog2(tce_mem_size) - 16); + tar |= tce_mem_addr; /* addr is naturally aligned */ + tar |= CA_TAR_VALID; + printf("PHB%d: Writing TAR: 0x%016llx\n", p->index, tar); + } + out_be64(p->ca_regs + CA_TARn(p->index), tar); + + /* Now set the TCE enable if we set a valid address */ + if (tce_mem_addr) { + cfg |= CAP_PCFGRW_TCE_EN; + out_be32(p->regs + CAP_PCFGRW, cfg); + } + + return OPAL_SUCCESS; +} + + +static const struct phb_ops p5ioc2_phb_ops = { + .lock = p5ioc2_phb_lock, + .unlock = p5ioc2_phb_unlock, + .cfg_read8 = p5ioc2_pcicfg_read8, + .cfg_read16 = p5ioc2_pcicfg_read16, + .cfg_read32 = p5ioc2_pcicfg_read32, + .cfg_write8 = p5ioc2_pcicfg_write8, + .cfg_write16 = p5ioc2_pcicfg_write16, + .cfg_write32 = p5ioc2_pcicfg_write32, + .choose_bus = p5ioc2_choose_bus, + .eeh_freeze_status = p5ioc2_eeh_freeze_status, + .eeh_freeze_clear = p5ioc2_eeh_freeze_clear, + .next_error = p5ioc2_eeh_next_error, + .get_msi_64 = p5ioc2_get_msi_64, + .ioda_reset = p5ioc2_ioda_reset, + .set_phb_tce_memory = p5ioc2_set_phb_tce_memory, + .presence_detect = p5ioc2_presence_detect, + .link_state = p5ioc2_link_state, + .power_state = p5ioc2_power_state, + .slot_power_off = p5ioc2_slot_power_off, + .slot_power_on = p5ioc2_slot_power_on, + .hot_reset = p5ioc2_hot_reset, + .fundamental_reset = p5ioc2_freset, + .poll = p5ioc2_poll, +}; + +/* p5ioc2_phb_get_xive - Interrupt control from OPAL */ +static int64_t p5ioc2_phb_get_xive(void *data, uint32_t isn, + uint16_t *server, uint8_t *prio) +{ + struct p5ioc2_phb *p = data; + uint32_t irq, xivr, fbuid = P7_IRQ_FBUID(isn); + + if (fbuid != p->buid) + return OPAL_PARAMETER; + irq = isn & 0xf; + + xivr = p->xive_cache[irq]; + *server = GETFIELD(CAP_XIVR_SERVER, xivr); + *prio = GETFIELD(CAP_XIVR_PRIO, xivr); + + return OPAL_SUCCESS; +} + +/* p5ioc2_phb_set_xive - Interrupt control from OPAL */ +static int64_t p5ioc2_phb_set_xive(void *data, uint32_t isn, + uint16_t server, uint8_t prio) +{ + struct p5ioc2_phb *p = data; + uint32_t irq, xivr, fbuid = P7_IRQ_FBUID(isn); + + if (fbuid != p->buid) + return OPAL_PARAMETER; + irq = isn & 0xf; + + printf("PHB%d: Set XIVE isn %04x (irq=%d) server=%x, prio=%x\n", + p->index, isn, irq, server, prio); + + xivr = SETFIELD(CAP_XIVR_SERVER, 0, server); + xivr = SETFIELD(CAP_XIVR_PRIO, xivr, prio); + p->xive_cache[irq] = xivr; + + /* Now we mangle the server and priority */ + if (prio == 0xff) { + server = 0; + prio = 0xff; + } else { + prio = (prio >> 3) | ((server & 7) << 5); + server = server >> 3; + } + + /* We use HRT entry 0 always for now */ + xivr = SETFIELD(CAP_XIVR_SERVER, 0, server); + xivr = SETFIELD(CAP_XIVR_PRIO, xivr, prio); + out_be32(p->regs + CAP_XIVRn(irq), xivr); + printf("PHB%d: wrote 0x%08x to XIVR %d\n", p->index, xivr, irq); + + return OPAL_SUCCESS; +} + +/* IRQ ops for OS interrupts (not internal) */ +static const struct irq_source_ops p5ioc2_phb_os_irq_ops = { + .get_xive = p5ioc2_phb_get_xive, + .set_xive = p5ioc2_phb_set_xive, +}; + + +static void p5ioc2_phb_init_utl(struct p5ioc2_phb *p __unused) +{ + /* XXX FIXME */ +} + +static void p5ioc2_phb_init_pcie(struct p5ioc2_phb *p) +{ + int64_t ecap, aercap; + + ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP); + if (ecap < 0) { + /* Shouldn't happen */ + prerror("P5IOC2: Failed to locate PCI-E cap in bridge\n"); + return; + } + p->ecap = ecap; + + aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL); + if (aercap < 0) { + /* Shouldn't happen */ + prerror("P5IOC2: Failed to locate AER ext cap in bridge\n"); + 
return; + } + p->aercap = aercap; + + /* XXX plenty more to do ... */ +} + +static void p5ioc2_phb_hwinit(struct p5ioc2_phb *p) +{ + uint16_t pcicmd; + uint32_t phbid; + + printf("P5IOC2: Initializing PHB HW...\n"); + + /* Enable PHB and disable address decoding */ + phbid = in_be32(p->ca_regs + CA_PHBIDn(p->index)); + phbid |= CA_PHBID_PHB_ENABLE; + phbid &= ~CA_PHBID_ADDRSPACE_ENABLE; + out_be32(p->ca_regs + CA_PHBIDn(p->index), phbid); + + /* Set BUID */ + out_be32(p->regs + CAP_BUID, SETFIELD(CAP_BUID, 0, + P7_BUID_BASE(p->buid))); + out_be32(p->regs + CAP_MSIBASE, P7_BUID_BASE(p->buid) << 16); + + /* Set IO and Memory mapping */ + out_be32(p->regs + CAP_IOAD_H, hi32(p->io_base + IO_PCI_START)); + out_be32(p->regs + CAP_IOAD_L, lo32(p->io_base + IO_PCI_START)); + out_be32(p->regs + CAP_IOSZ, ~(IO_PCI_SIZE - 1)); + out_be32(p->regs + CAP_IO_ST, IO_PCI_START); + out_be32(p->regs + CAP_MEM1_H, hi32(p->mm_base + MM_PCI_START)); + out_be32(p->regs + CAP_MEM1_L, lo32(p->mm_base + MM_PCI_START)); + out_be32(p->regs + CAP_MSZ1, ~(MM_PCI_SIZE - 1)); + out_be32(p->regs + CAP_MEM_ST, MM_PCI_START); + + /* Setup the MODE registers. We capture the values used + * by pHyp/OPAL + */ + out_be32(p->regs + CAP_MODE0, 0x00800010); + out_be32(p->regs + CAP_MODE1, 0x00800000); + out_be32(p->regs + CAP_MODE3, 0xFFC00050); + if (p->is_pcie) + out_be32(p->regs + CAP_MODE2, 0x00000400); + else + out_be32(p->regs + CAP_MODE2, 0x00000408); + + /* XXX Setup of the arbiter... not sure what to do here, + * probably system specific (depends on how things are + * wired on the motherboard). I set things up based on + * the values I read on a Juno machine. We setup the BPR + * with the various timeouts etc... as well, based on + * similarly captured values + */ + if (p->is_pcie) { + out_be32(p->regs + CAP_AER, 0x04000000); + out_be32(p->regs + CAP_BPR, 0x0000004f); + } else { + out_be32(p->regs + CAP_AER, 0x84000000); + out_be32(p->regs + CAP_BPR, 0x000f00ff); + } + + /* XXX Setup error reporting registers */ + + /* Clear errors in PLSSR and DMACSR */ + out_be32(p->regs + CAP_DMACSR, 0); + out_be32(p->regs + CAP_PLSSR, 0); + + /* Configure MSIs on PCIe only */ + if (p->is_pcie) { + /* XXX Check that setting ! That's what OPAL uses but + * I suspect it might not be correct. We enable a masking + * of 3 bits and no offset, which makes me think only + * some MSIs will work... not 100% certain. + */ + out_be32(p->regs + CAP_MVE0, CAP_MVE_VALID | + SETFIELD(CAP_MVE_TBL_OFF, 0, 0) | + SETFIELD(CAP_MVE_NUM_INT, 0, 0x3)); + out_be32(p->regs + CAP_MVE1, 0); + } + + /* Configuration. We keep TCEs disabled */ + out_be32(p->regs + CAP_PCFGRW, + CAP_PCFGRW_ERR_RECOV_EN | + CAP_PCFGRW_FREEZE_EN | + CAP_PCFGRW_DAC_DISABLE | + (p->is_pcie ? CAP_PCFGRW_MSI_EN : 0)); + + /* Re-enable address decode */ + phbid |= CA_PHBID_ADDRSPACE_ENABLE; + out_be32(p->ca_regs + CA_PHBIDn(p->index), phbid); + + /* PCIe specific inits */ + if (p->is_pcie) { + p5ioc2_phb_init_utl(p); + p5ioc2_phb_init_pcie(p); + } + + /* Take out reset pins on PCI-X.
PCI-E will be handled via the hotplug + * controller separately + */ + if (!p->is_pcie) { + uint32_t val; + + /* Setting 1's will deassert the reset signals */ + out_be32(p->regs + CAP_CRR, CAP_CRR_RESET1 | CAP_CRR_RESET2); + + /* Set max sub bus */ + p5ioc2_pcicfg_write8(&p->phb, 0, 0x41, 0xff); + + /* XXX SHPC stuff */ + printf("P5IOC2: SHPC Slots available 1 : %08x\n", + in_be32(p->regs + 0xb20)); + printf("P5IOC2: SHPC Slots available 2 : %08x\n", + in_be32(p->regs + 0xb24)); + printf("P5IOC2: SHPC Slots config : %08x\n", + in_be32(p->regs + 0xb28)); + printf("P5IOC2: SHPC Secondary bus conf : %08x\n", + in_be32(p->regs + 0xb2c)); + + p5ioc2_pcicfg_read32(&p->phb, 0, 0, &val); + printf("P5IOC2: val0: %08x\n", val); + p5ioc2_pcicfg_read32(&p->phb, 0, 4, &val); + printf("P5IOC2: val4: %08x\n", val); + } + + /* Enable PCI command/status */ + p5ioc2_pcicfg_read16(&p->phb, 0, PCI_CFG_CMD, &pcicmd); + pcicmd |= PCI_CFG_CMD_IO_EN | PCI_CFG_CMD_MEM_EN | + PCI_CFG_CMD_BUS_MASTER_EN; + p5ioc2_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD, pcicmd); + + p->state = P5IOC2_PHB_STATE_FUNCTIONAL; +} + +static void p5ioc2_pcie_add_node(struct p5ioc2_phb *p) +{ + uint64_t reg[2], mmb, iob; + uint32_t lsibase, icsp = get_ics_phandle(); + struct dt_node *np; + + reg[0] = cleanup_addr((uint64_t)p->regs); + reg[1] = 0x1000; + + np = dt_new_addr(p->ioc->dt_node, "pciex", reg[0]); + if (!np) + return; + + p->phb.dt_node = np; + dt_add_property_strings(np, "compatible", "ibm,p5ioc2-pciex"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", reg, sizeof(reg)); + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */ + dt_add_property_cells(np, "interrupt-parent", icsp); + /* XXX FIXME: add phb own interrupts */ + dt_add_property_cells(np, "ibm,opal-num-pes", 1); + dt_add_property_cells(np, "ibm,opal-msi-ranges", (p->buid << 4) + 5, 8); + /* XXX FIXME: add slot-name */ + iob = cleanup_addr(p->io_base + IO_PCI_START); + mmb = cleanup_addr(p->mm_base + MM_PCI_START); + dt_add_property_cells(np, "ranges", + /* IO space */ + 0x01000000, 0x00000000, 0x00000000, + hi32(iob), lo32(iob), 0, IO_PCI_SIZE, + /* M32 space */ + 0x02000000, 0x00000000, MM_PCI_START, + hi32(mmb), lo32(mmb), 0, MM_PCI_SIZE); + + /* Add associativity properties */ + add_chip_dev_associativity(np); + + /* The interrupt maps will be generated in the RC node by the + * PCI code based on the content of this structure: + */ + lsibase = p->buid << 4; + p->phb.lstate.int_size = 1; + p->phb.lstate.int_val[0][0] = lsibase + 1; + p->phb.lstate.int_val[1][0] = lsibase + 2; + p->phb.lstate.int_val[2][0] = lsibase + 3; + p->phb.lstate.int_val[3][0] = lsibase + 4; + p->phb.lstate.int_parent[0] = icsp; + p->phb.lstate.int_parent[1] = icsp; + p->phb.lstate.int_parent[2] = icsp; + p->phb.lstate.int_parent[3] = icsp; + + /* reset clear timestamp... to add if we do a reset and want + * to avoid waiting in skiboot + */ + //dt_property_cells("reset-clear-timestamp",.... 
+} + +static void p5ioc2_pcix_add_node(struct p5ioc2_phb *p) +{ + uint64_t reg[2], mmb, iob; + uint32_t lsibase, icsp = get_ics_phandle(); + struct dt_node *np; + + reg[0] = cleanup_addr((uint64_t)p->regs); + reg[1] = 0x1000; + + np = dt_new_addr(p->ioc->dt_node, "pci", reg[0]); + if (!np) + return; + + p->phb.dt_node = np; + dt_add_property_strings(np, "compatible", "ibm,p5ioc2-pcix"); + dt_add_property_strings(np, "device_type", "pci"); + dt_add_property(np, "reg", reg, sizeof(reg)); + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */ + //dt_add_property_cells(np, "bus-width", 8); /* Figure out from VPD ? */ + dt_add_property_cells(np, "interrupt-parent", icsp); + /* XXX FIXME: add phb own interrupts */ + dt_add_property_cells(np, "ibm,opal-num-pes", 1); + /* XXX FIXME: add slot-name */ + iob = cleanup_addr(p->io_base + IO_PCI_START); + mmb = cleanup_addr(p->mm_base + MM_PCI_START); + dt_add_property_cells(np, "ranges", + /* IO space */ + 0x01000000, 0x00000000, 0x00000000, + hi32(iob), lo32(iob), 0, IO_PCI_SIZE, + /* M32 space */ + 0x02000000, 0x00000000, MM_PCI_START, + hi32(mmb), lo32(mmb), 0, MM_PCI_SIZE); + + /* Add associativity properties */ + add_chip_dev_associativity(np); + + /* The interrupt maps will be generated in the RC node by the + * PCI code based on the content of this structure: + */ + lsibase = p->buid << 4; + p->phb.lstate.int_size = 1; + p->phb.lstate.int_val[0][0] = lsibase + 1; + p->phb.lstate.int_val[1][0] = lsibase + 2; + p->phb.lstate.int_val[2][0] = lsibase + 3; + p->phb.lstate.int_val[3][0] = lsibase + 4; + p->phb.lstate.int_parent[0] = icsp; + p->phb.lstate.int_parent[1] = icsp; + p->phb.lstate.int_parent[2] = icsp; + p->phb.lstate.int_parent[3] = icsp; + + /* On PCI-X we need to create an interrupt map here */ + pci_std_swizzle_irq_map(np, NULL, &p->phb.lstate, 0); +} + +void p5ioc2_phb_setup(struct p5ioc2 *ioc, struct p5ioc2_phb *p, + uint8_t ca, uint8_t index, bool active, + uint32_t buid) +{ + uint32_t phbid; + + p->index = index; + p->ca = ca; + p->ioc = ioc; + p->active = active; + p->phb.ops = &p5ioc2_phb_ops; + p->buid = buid; + p->ca_regs = ca ? ioc->ca1_regs : ioc->ca0_regs; + p->regs = p->ca_regs + CA_PHBn_REGS(index); + + printf("P5IOC2: Initializing PHB %d on CA%d, regs @%p, BUID 0x%04x\n", + p->index, p->ca, p->regs, p->buid); + + /* Memory map: described in p5ioc2.h */ + p->mm_base = ca ? 
ioc->ca1_mm_region : ioc->ca0_mm_region; + p->mm_base += MM_WINDOW_SIZE * index; + p->io_base = (uint64_t)p->ca_regs; + p->io_base += IO_PCI_SIZE * (index + 1); + p->state = P5IOC2_PHB_STATE_UNINITIALIZED; + + /* Query PHB type */ + phbid = in_be32(p->ca_regs + CA_PHBIDn(p->index)); + + switch(GETFIELD(CA_PHBID_PHB_TYPE, phbid)) { + case CA_PHBTYPE_PCIX1_0: + p->is_pcie = false; + p->phb.scan_map = 0x0003; + p->phb.phb_type = phb_type_pcix_v1; + printf("P5IOC2: PHB is PCI/PCI-X 1.0\n"); + break; + case CA_PHBTYPE_PCIX2_0: + p->is_pcie = false; + p->phb.scan_map = 0x0003; + p->phb.phb_type = phb_type_pcix_v2; + printf("P5IOC2: PHB is PCI/PCI-X 2.0\n"); + break; + case CA_PHBTYPE_PCIE_G1: + p->is_pcie = true; + p->phb.scan_map = 0x0001; + p->phb.phb_type = phb_type_pcie_v1; + printf("P5IOC2: PHB is PCI Express Gen 1\n"); + break; + case CA_PHBTYPE_PCIE_G2: + p->is_pcie = true; + p->phb.scan_map = 0x0001; + p->phb.phb_type = phb_type_pcie_v2; + printf("P5IOC2: PHB is PCI Express Gen 2\n"); + break; + default: + printf("P5IOC2: Unknown PHB type ! phbid=%08x\n", phbid); + p->is_pcie = true; + p->phb.scan_map = 0x0001; + p->phb.phb_type = phb_type_pcie_v1; + } + + /* Find P5IOC2 base location code in IOC */ + p->phb.base_loc_code = dt_prop_get_def(ioc->dt_node, + "ibm,io-base-loc-code", NULL); + if (!p->phb.base_loc_code) + prerror("P5IOC2: Base location code not found !\n"); + + /* Add device nodes */ + if (p->is_pcie) + p5ioc2_pcie_add_node(p); + else + p5ioc2_pcix_add_node(p); + + /* Initialize PHB HW */ + p5ioc2_phb_hwinit(p); + + /* Register all 16 interrupt sources for now as OS visible + * + * If we ever add some EEH, we might take out the error interrupts + * and register them as OPAL internal interrupts instead + */ + register_irq_source(&p5ioc2_phb_os_irq_ops, p, p->buid << 4, 16); + + /* We cannot query the PHB type yet as the registers aren't routed + * so we'll do that in the inits, at which point we'll establish + * the scan map + */ + + /* We register the PHB before we initialize it so we + * get a useful OPAL ID for it + */ + pci_register_phb(&p->phb); + + /* Platform additional setup */ + if (platform.pci_setup_phb) + platform.pci_setup_phb(&p->phb, p->index); +} + diff --git a/hw/p5ioc2.c b/hw/p5ioc2.c new file mode 100644 index 00000000..d8b95917 --- /dev/null +++ b/hw/p5ioc2.c @@ -0,0 +1,297 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int64_t p5ioc2_set_tce_mem(struct io_hub *hub, uint64_t address, + uint64_t size) +{ + struct p5ioc2 *ioc = iohub_to_p5ioc2(hub); + int64_t rc; + + printf("P5IOC2: set_tce_mem(0x%016llx size 0x%llx)\n", + address, size); + + /* The address passed must be naturally aligned */ + if (address && !is_pow2(size)) + return OPAL_PARAMETER; + if (address & (size - 1)) + return OPAL_PARAMETER; + + ioc->tce_base = address; + ioc->tce_size = size; + + rc = gx_configure_tce_bar(ioc->host_chip, ioc->gx_bus, + address, size); + if (rc) + return OPAL_INTERNAL_ERROR; + return OPAL_SUCCESS; +} + +static int64_t p5ioc2_get_diag_data(struct io_hub *hub __unused, + void *diag_buffer __unused, + uint64_t diag_buffer_len __unused) +{ + /* XXX Not yet implemented */ + return OPAL_UNSUPPORTED; +} + +static const struct io_hub_ops p5ioc2_hub_ops = { + .set_tce_mem = p5ioc2_set_tce_mem, + .get_diag_data = p5ioc2_get_diag_data, +}; + +static void p5ioc2_inits(struct p5ioc2 *ioc) +{ + uint64_t val; + unsigned int p, n; + + printf("P5IOC2: Initializing hub...\n"); + + /* + * BML base inits + */ + /* mask off interrupt presentation timeout in FIRMC */ + out_be64(ioc->regs + (P5IOC2_FIRMC | P5IOC2_REG_OR), + 0x0000080000000000); + + /* turn off display alter mode */ + out_be64(ioc->regs + (P5IOC2_CTL | P5IOC2_REG_AND), + 0xffffff7fffffffff); + + /* setup hub and clustering interrupts BUIDs to 1 and 2 */ + out_be64(ioc->regs + P5IOC2_SBUID, 0x0001000200000000); + + /* setup old style MSI BUID (should be unused but set it up anyway) */ + out_be32(ioc->regs + P5IOC2_BUCO, 0xf); + + /* Set XIXO bit 0 needed for "enhanced" TCEs or else TCE + * fetches appear as normal memory reads on GX causing + * P7 to checkstop when a TCE DKill collides with them. + */ + out_be64(ioc->regs + P5IOC2_XIXO, in_be64(ioc->regs + P5IOC2_XIXO) + | P5IOC2_XIXO_ENH_TCE); + + /* Clear routing tables */ + for (n = 0; n < 16; n++) { + for (p = 0; p < 8; p++) + out_be64(ioc->regs + P5IOC2_TxRTE(p,n), 0); + } + for (n = 0; n < 32; n++) + out_be64(ioc->regs + P5IOC2_BUIDRTE(n), 0); + + /* + * Setup routing. We use the same setup that pHyp appears + * to do (after inspecting the various registers with SCOM) + * + * We assume the BARs are already setup by the FSP such + * that BAR0 is 128G (8G region size) and BAR6 is + * 256M (16M region size). 
+ * + * The routing is based on what pHyp and BML do, each Calgary + * get one slice of BAR6 and two slices of BAR0 + */ + /* BAR 0 segments 0 & 1 -> CA0 */ + out_be64(ioc->regs + P5IOC2_TxRTE(0,0), + P5IOC2_TxRTE_VALID | P5IOC2_CA0_RIO_ID); + out_be64(ioc->regs + P5IOC2_TxRTE(0,1), + P5IOC2_TxRTE_VALID | P5IOC2_CA0_RIO_ID); + + /* BAR 0 segments 2 & 3 -> CA1 */ + out_be64(ioc->regs + P5IOC2_TxRTE(0,2), + P5IOC2_TxRTE_VALID | P5IOC2_CA1_RIO_ID); + out_be64(ioc->regs + P5IOC2_TxRTE(0,3), + P5IOC2_TxRTE_VALID | P5IOC2_CA1_RIO_ID); + + /* BAR 6 segments 0 -> CA0 */ + out_be64(ioc->regs + P5IOC2_TxRTE(6,0), + P5IOC2_TxRTE_VALID | P5IOC2_CA0_RIO_ID); + + /* BAR 6 segments 1 -> CA0 */ + out_be64(ioc->regs + P5IOC2_TxRTE(6,1), + P5IOC2_TxRTE_VALID | P5IOC2_CA1_RIO_ID); + + /* + * BUID routing, we send entries 1 to CA0 and 2 to CA1 + * just like pHyp and make sure the base and mask are + * both clear in SID to we route the whole 512 block + */ + val = in_be64(ioc->regs + P5IOC2_SID); + val = SETFIELD(P5IOC2_SID_BUID_BASE, val, 0); + val = SETFIELD(P5IOC2_SID_BUID_MASK, val, 0); + out_be64(ioc->regs + P5IOC2_SID, val); + out_be64(ioc->regs + P5IOC2_BUIDRTE(1), + P5IOC2_BUIDRTE_VALID | P5IOC2_BUIDRTE_RR_RET | + P5IOC2_CA0_RIO_ID); + out_be64(ioc->regs + P5IOC2_BUIDRTE(2), + P5IOC2_BUIDRTE_VALID | P5IOC2_BUIDRTE_RR_RET | + P5IOC2_CA1_RIO_ID); +} + +static void p5ioc2_ca_init(struct p5ioc2 *ioc, int ca) +{ + void *regs = ca ? ioc->ca1_regs : ioc->ca0_regs; + uint64_t val; + + printf("P5IOC2: Initializing Calgary %d...\n", ca); + + /* Setup device BUID */ + val = SETFIELD(CA_DEVBUID, 0ul, ca ? P5IOC2_CA1_BUID : P5IOC2_CA0_BUID); + out_be32(regs + CA_DEVBUID, val); + + /* Setup HubID in TARm (and keep TCE clear, Linux will init that) + * + * BML and pHyp sets the values to 1 for CA0 and 4 for CA1. We + * keep the TAR valid bit clear as well. + */ + val = SETFIELD(CA_TAR_HUBID, 0ul, ca ? 4 : 1); + val = SETFIELD(CA_TAR_ALTHUBID, val, ca ? 4 : 1); + out_be64(regs + CA_TAR0, val); + out_be64(regs + CA_TAR1, val); + out_be64(regs + CA_TAR2, val); + out_be64(regs + CA_TAR3, val); + + /* Bridge config register. We set it up to the same value as observed + * under pHyp on a Juno machine. The difference from the IPL value is + * that TCE buffers are enabled, discard timers are increased and + * we disable response status to avoid errors. + */ + //out_be64(regs + CA_CCR, 0x5045DDDED2000000); + // disable memlimit: + out_be64(regs + CA_CCR, 0x5005DDDED2000000); + + /* The system memory base/limit etc... setup will be done when the + * user enables TCE via OPAL calls + */ +} + +static void p5ioc2_create_hub(struct dt_node *np) +{ + struct p5ioc2 *ioc; + unsigned int i, id, irq; + char *path; + + /* Use the BUID extension as ID and add it to device-tree */ + id = dt_prop_get_u32(np, "ibm,buid-ext"); + path = dt_get_path(np); + printf("P5IOC2: Found at %s ID 0x%x\n", path, id); + free(path); + dt_add_property_cells(np, "ibm,opal-hubid", 0, id); + + /* Load VPD LID */ + vpd_iohub_load(np); + + ioc = zalloc(sizeof(struct p5ioc2)); + if (!ioc) + return; + ioc->hub.hub_id = id; + ioc->hub.ops = &p5ioc2_hub_ops; + ioc->dt_node = np; + + /* We assume SBAR == GX0 + some hard coded offset */ + ioc->regs = (void *)dt_get_address(np, 0, NULL); + + /* For debugging... 
*/ + for (i = 0; i < 8; i++) + printf("P5IOC2: BAR%d = 0x%016llx M=0x%16llx\n", i, + in_be64(ioc->regs + P5IOC2_BAR(i)), + in_be64(ioc->regs + P5IOC2_BARM(i))); + + ioc->host_chip = dt_get_chip_id(np); + + ioc->gx_bus = dt_prop_get_u32(np, "ibm,gx-index"); + + /* Rather than reading the BARs in P5IOC2, we "know" that + * BAR6 matches GX BAR 1 and BAR0 matches GX BAR 2. This + * is a bit fishy but will work for the few machines this + * is intended to work on + */ + ioc->bar6 = dt_prop_get_u64(np, "ibm,gx-bar-1"); + ioc->bar0 = dt_prop_get_u64(np, "ibm,gx-bar-2"); + + printf("DT BAR6 = 0x%016llx\n", ioc->bar6); + printf("DT BAR0 = 0x%016llx\n", ioc->bar0); + + /* We setup the corresponding Calgary register bases and memory + * regions. Note: those cannot be used until the routing has + * been setup by inits + */ + ioc->ca0_regs = (void *)ioc->bar6 + P5IOC2_CA0_REG_OFFSET; + ioc->ca1_regs = (void *)ioc->bar6 + P5IOC2_CA1_REG_OFFSET; + ioc->ca0_mm_region = ioc->bar0 + P5IOC2_CA0_MM_OFFSET; + ioc->ca1_mm_region = ioc->bar0 + P5IOC2_CA1_MM_OFFSET; + + /* Base of our BUIDs, will be refined later */ + ioc->buid_base = id << 9; + + /* Add interrupts: XXX These are the hub interrupts, we should add the + * calgary ones as well... but we don't handle any of them currently + * anyway. + */ + irq = (ioc->buid_base + 1) << 4; + dt_add_property_cells(np, "interrupts", irq, irq + 1); + dt_add_property_cells(np, "interrupt-base", irq); + + + /* Now, we do the bulk of the inits */ + p5ioc2_inits(ioc); + p5ioc2_ca_init(ioc, 0); + p5ioc2_ca_init(ioc, 1); + + /* So how do we know what PHBs to create ? Let's try all of them + * and we'll see if that causes problems. TODO: Use VPD ! + */ + for (i = 0; i < 4; i++) + p5ioc2_phb_setup(ioc, &ioc->ca0_phbs[i], 0, i, true, + ioc->buid_base + P5IOC2_CA0_BUID + i + 1); + for (i = 0; i < 4; i++) + p5ioc2_phb_setup(ioc, &ioc->ca1_phbs[i], 1, i, true, + ioc->buid_base + P5IOC2_CA1_BUID + i + 1); + + /* Reset delay... synchronous, hope we never do that as a + * result of an OPAL callback. We shouldn't really need this + * here and may fold it in the generic slot init sequence but + * it's not like we care much about that p5ioc2 code... + * + * This is mostly to give devices a chance to settle after + * having lifted the reset pin on PCI-X. + */ + time_wait_ms(1000); + + printf("P5IOC2: Initialization complete\n"); + + cec_register(&ioc->hub); +} + +void probe_p5ioc2(void) +{ + struct dt_node *np; + + dt_for_each_compatible(dt_root, np, "ibm,p5ioc2") + p5ioc2_create_hub(np); +} + diff --git a/hw/p7ioc-inits.c b/hw/p7ioc-inits.c new file mode 100644 index 00000000..dc5c3703 --- /dev/null +++ b/hw/p7ioc-inits.c @@ -0,0 +1,1096 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This inits are in part auto-generated from tables coming + * from the HW guys, then hand updated + */ +#include +#include +#include +#include +#include +#include + +#undef DUMP_CI_ROUTING +#undef DUMP_REG_WRITES + +#ifdef DUMP_REG_WRITES +#define REGW(offset, value) do { \ + out_be64(ioc->regs + (offset), (value)); \ + printf(" REGW: %06lx=%016lx RB: %016llx\n", \ + (unsigned long)(offset), \ + (unsigned long)(value), \ + in_be64(ioc->regs + (offset))); \ + in_be64(ioc->regs + (offset)); \ + } while(0) +#else +#define REGW(offset, value) do { \ + out_be64(ioc->regs + (offset), (value)); \ + in_be64(ioc->regs + (offset)); \ + } while(0) +#endif +#define REGR(offset) in_be64(ioc->regs + (offset)) + +static void p7ioc_init_BI(struct p7ioc *ioc) +{ + printf("P7IOC: Init BI...\n"); + + /*** General settings ***/ + + /* Init_1 and Init_2: Different between P7 and P7+ */ + if (PVR_TYPE(mfspr(SPR_PVR)) == PVR_TYPE_P7P) { + printf("P7IOC: -> Configured for P7+\n"); + + /* Chicken switches */ + REGW(0x3c00d8, 0x0004000000000600); + /* GX config */ + REGW(0x3c00a0, 0x9F8929BE00880085); + } else { + printf("P7IOC: -> Configured for P7\n"); + + /* P7 setting assumes "early write done" mode is + * enabled in the GX controller. It seems to be + * the case but maybe we want to check/set it via + * xscom ? + */ + /* Chicken switches */ + REGW(0x3c00d8, 0x00040000000004C0); + /* GX config */ + REGW(0x3c00a0, 0x9C8929BE00880085); + } + + /* + * Note: While running skiboot on Firebird-L, I have + * to print something or wait for a while. The root + * cause wasn't identified yet. + */ + time_wait_ms(100); + + /* Init_3: Upbound Credit Config */ + REGW(0x3c00c8, 0x0303060403030000); + /* Init_4: Credit Init Timer */ + REGW(0x3c00e8, 0x00000000000000FF); + + /* Init_4.1: BI Ack Timing */ + REGW(0x3c00e8, 0x0000FC0000000000); + /* Init_5: Ordering Override 0*/ + REGW(0x3c0200, 0x0000000000000000); + /* Init_6: Ordering Override 1*/ + REGW(0x3c0208, 0x0000000000000000); + + /*** Downbound TTYPE table ***/ + + /* Init_7: Enable sequence / speculation for CI Loads */ + REGW(0x3c00a8, 0x0000000000000004); + /* Init_8: */ + REGW(0x3c00b0, 0x700800C000000000); + /* Init_9: Enable sequence / speculation for CI Stores */ + REGW(0x3c00a8, 0x0000000000000005); + /* Init_10: */ + REGW(0x3c00b0, 0x704820C000000000); + /* Init_11: Enable speculation for EOI */ + REGW(0x3c00a8, 0x000000000000001B); + /* Init_12: */ + REGW(0x3c00b0, 0x3590204000000000); + /* Init_13: ENable speculation for DMA Rd Responses */ + REGW(0x3c00a8, 0x0000000000000020); + /* Init_14: */ + REGW(0x3c00b0, 0x1103C4C000000000); + /* Init_15: Enable sequence for DMA RWNITC */ + REGW(0x3c00a8, 0x0000000000000001); + /* Init_16: */ + REGW(0x3c00b0, 0xC000000000000000); + /* Init_17: Enable sequence for IOKill */ + REGW(0x3c00a8, 0x0000000000000009); + /* Init_18: */ + REGW(0x3c00b0, 0x4208210000000000); + /* Init_19: Enable sequence for IOKill */ + REGW(0x3c00a8, 0x000000000000000A); + /* Init_20: */ + REGW(0x3c00b0, 0x4200210000000000); + /* Init_21: Enable sequence for FMTC CI Store w/Kill */ + REGW(0x3c00a8, 0x0000000000000021); + + /*** Timer controls ***/ + + /* Init_22: */ + REGW(0x3c00b0, 0x4200300000000000); + /* Init_23: Dnbound timer mask */ + REGW(0x3c0190, 0x0040000000000000); + /* Init_24: Upbound timer mask 0 */ + REGW(0x3c0180, 0x0010001000100010); + /* Init_25: Upbound timer mask 1 */ + REGW(0x3c0188, 0x0010000000000000); + /* Init_26: Credit sync check config */ + REGW(0x3c00f0, 0xC102000000000000); + + /*** Setup trace ***/ + 
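+	/* Summary (from the Init_27..48 comments below): stop the trace
+	 * engine, program the debug controls and the CA0..CA3
+	 * compression/pattern-match arrays, then unfreeze and restart
+	 * the trace.
+	 */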
+ /* Init_27: DBG stop trace */ + REGW(0x3c0410, 0x4000000000000000); + /* Init_28: DBG control */ + REGW(0x3c0400, 0x0000000000000000); + /* Init_29: DBG Mode */ + REGW(0x3c0408, 0xA0000000F0CC3300); + /* Init_29a: DBG C0 (Stop on Error) */ + REGW(0x3c0418, 0xF4F00FFF00000000); + /* Init_30: DBG pre-mux select */ + REGW(0x3c0478, 0x0023000000000000); + /* Init_31: CA0 mode */ + REGW(0x3c04b0, 0x8000000000000000); + /* Init_32: CA0 Compression 0 */ + REGW(0x3c04b8, 0x0000000000000000); + /* Init_33: CA0 Compression 1 */ + REGW(0x3c04c0, 0x0000000000000000); + /* Init_34: CA0 Pattern A march (cmd1 selected val) */ + REGW(0x3c0480, 0x008000007FFFFF00); + /* Init_35: CA0 Trigger 0 definition (pattern A) */ + REGW(0x3c04a0, 0x8000000000000000); + /* Init_36: CA1 mode */ + REGW(0x3c0530, 0x8000000000000000); + /* Init_37: CA1 Compression 0 */ + REGW(0x3c0538, 0x0000000000000000); + /* Init_38: CA1 Compression 1 */ + REGW(0x3c0540, 0x0000000000000000); + /* Init_39: CA2 mode */ + REGW(0x3c05b0, 0x8000000000000000); + /* Init_40: CA2 Compression 0 */ + REGW(0x3c05b8, 0x0000000000000000); + /* Init_41: CA2 Compression 1 */ + REGW(0x3c05c0, 0x0000000000000000); + /* Init_42: CA3 Mode */ + REGW(0x3c0630, 0x8000000000000000); + /* Init_43: CA3 Compression 0 */ + REGW(0x3c0638, 0x0000000000000000); + /* Init_44: CA3 Compression 1 */ + REGW(0x3c0640, 0x0000000000000000); + /* Init_45: CA3 Pattern A match (AIB val) */ + REGW(0x3c0600, 0x80000100FFFEFF00); + /* Init_46: CA3 Trigger 0 definition (pattern A) */ + REGW(0x3c0620, 0x8000000000000000); + /* Init_47: DBG unfreeze trace */ + REGW(0x3c0410, 0x1000000000000000); + /* Init_48: DBG start trace */ + REGW(0x3c0410, 0x8000000000000000); + + /*** AIB Port Config ***/ + + /* Init_49: AIB Port Information */ + REGW(0x3c00d0, 0x0888888800000000); + /* Init_50: Port Ordering controls */ + REGW(0x3c0200, 0x0000000000000000); + + /*** LEMs (need to match recov. tables) ***/ + + /* Init_51: Clear upbound LEM */ + REGW(0x3c0000, 0x0000000000000000); + /* Init_52: Clear upbound WOF */ + REGW(0x3c0040, 0x0000000000000000); + /* Init_53: Clear Dnbound LEM */ + REGW(0x3c0050, 0x0000000000000000); + /* Init_54: Clear Dnbound WOF */ + REGW(0x3c0090, 0x0000000000000000); + /* Init_55: Clear Fences */ + REGW(0x3c0130, 0x0000000000000000); + /* Init_56: Clear Erpt latches */ + REGW(0x3c0148, 0x0080000000000000); + /* Init_57: Set Upbound LEM Action0 */ + REGW(0x3c0030, 0x0800000000800000); + /* Init_58: Set Upbound LEN Action1 */ + REGW(0x3c0038, 0x0000000000000000); + /* Init_59: Set Upbound LEM Mask (AND write) */ + REGW(0x3c0020, 0x0800000000000000); + /* Init_60: Set Dnbound LEM Action0 */ + REGW(0x3c0080, 0x2000080CA07FFF40); + /* Init_61: Set Dnbound LEM Action1 */ + REGW(0x3c0088, 0x0000000000000000); + /* Init_62: Set Dnbound LEM Mask (AND write) */ + REGW(0x3c0070, 0x00000800200FFE00); + + /*** Setup Fences (need to match recov. 
tables) ***/ + + /* Init_63: Set Upbound Damage Control 0 (GX Err) */ + REGW(0x3c0100, 0xF7FFFFFFFF7FFFFF); + /* Init_64: Set Upbound Damage Control 1 (AIB Fence) */ + REGW(0x3c0108, 0xF7FFFFFFFF7FFFFF); + /* Init_65: Set Upbound Damage Control 2 (Drop Pkt) */ + REGW(0x3c0110, 0x0010054000000000); + /* Init_66: Set Dnbound Damage Control 0 (GX Err) */ + REGW(0x3c0118, 0xDFFFF7F35F8000BF); + /* Init_67: Set Dnbound Damage Control 1 (AIB Fence) */ + REGW(0x3c0120, 0xDFFFF7F35F8000BF); + /* Init_68: Set Dnbound Damage Control 2 (Drop Pkt) */ + REGW(0x3c0128, 0x0000000C00000000); +} + +static void p7ioc_init_MISC_HSS(struct p7ioc *ioc) +{ + unsigned int i, regbase; + + printf("P7IOC: Init HSS...\n"); + + /* Note: These values might need to be tweaked per system and + * per physical port depending on electrical characteristics. + * + * For now we stick to the defaults provided by the spec. + */ + for (i = 0; i < P7IOC_NUM_PHBS; i++) { + regbase = P7IOC_HSS_BASE + i * P7IOC_HSS_STRIDE; + + if (!p7ioc_phb_enabled(ioc, i)) + continue; + + /* Init_1: HSSn CTL2 */ + REGW(regbase + P7IOC_HSSn_CTL2_OFFSET, 0xFFFF6DB6DB000000); + /* Init_2: HSSn CTL3 */ + REGW(regbase + P7IOC_HSSn_CTL3_OFFSET, 0x1130000320000000); + /* Init_3: HSSn CTL8 */ + REGW(regbase + P7IOC_HSSn_CTL8_OFFSET, 0xDDDDDDDD00000000); + +#if 0 /* All these remain set to the values configured by the FSP */ + /* Init_4: HSSn CTL9 */ + REGW(regbase + P7IOC_HSSn_CTL9_OFFSET, 0x9999999900000000); + /* Init_5: HSSn CTL10 */ + REGW(regbase + P7IOC_HSSn_CTL10_OFFSET, 0x8888888800000000); + /* Init_6: HSSn CTL11 */ + REGW(regbase + P7IOC_HSSn_CTL11_OFFSET, 0x4444444400000000); + /* Init_7: HSSn CTL12 */ + REGW(regbase + P7IOC_HSSn_CTL12_OFFSET, 0x3333333300000000); + /* Init_8: HSSn CTL13 */ + REGW(regbase + P7IOC_HSSn_CTL13_OFFSET, 0x2222222200000000); + /* Init_9: HSSn CTL14 */ + REGW(regbase + P7IOC_HSSn_CTL14_OFFSET, 0x1111111100000000); + /* Init_10: HSSn CTL15 */ + REGW(regbase + P7IOC_HSSn_CTL15_OFFSET, 0x1111111100000000); + /* Init_11: HSSn CTL16 */ + REGW(regbase + P7IOC_HSSn_CTL16_OFFSET, 0x9999999900000000); + /* Init_12: HSSn CTL17 */ + REGW(regbase + P7IOC_HSSn_CTL17_OFFSET, 0x8888888800000000); + /* Init_13: HSSn CTL18 */ + REGW(regbase + P7IOC_HSSn_CTL18_OFFSET, 0xDDDDDDDD00000000); + /* Init_14: HSSn CTL19 */ + REGW(regbase + P7IOC_HSSn_CTL19_OFFSET, 0xCCCCCCCC00000000); + /* Init_15: HSSn CTL20 */ + REGW(regbase + P7IOC_HSSn_CTL20_OFFSET, 0xBBBBBBBB00000000); + /* Init_16: HSSn CTL21 */ + REGW(regbase + P7IOC_HSSn_CTL21_OFFSET, 0x9999999900000000); + /* Init_17: HSSn CTL22 */ + REGW(regbase + P7IOC_HSSn_CTL22_OFFSET, 0x8888888800000000); + /* Init_18: HSSn CTL23 */ + REGW(regbase + P7IOC_HSSn_CTL23_OFFSET, 0x7777777700000000); +#endif + } +} + +static void p7ioc_init_RGC(struct p7ioc *ioc) +{ + unsigned int i; + uint64_t val, cfg; + + printf("P7IOC: Init RGC...\n"); + + /*** Clear ERPT Macros ***/ + + /* Init_1: RGC Configuration reg */ + cfg = REGR(0x3e1c08); + REGW(0x3e1c08, cfg | PPC_BIT(1)); + time_wait_ms(1); + + /* Init_2: RGC Configuration reg */ + REGW(0x3e1c08, cfg); + + /*** Set LEM regs (needs to match recov. 
code) */ + + /* Init_3: LEM FIR Accumulator */ + REGW(0x3e1e00, 0x0000000000000000); + /* Init_4: LEM Action 0 */ + REGW(0x3e1e30, 0x0FFF791F0B030000); + /* Init_5: LEN Action 1 */ + REGW(0x3e1e38, 0x0000000000000000); + /* Init_6: LEM WOF */ + REGW(0x3e1e40, 0x0000000000000000); + /* Init_7: LEM Mask Reg (AND write) */ + REGW(0x3e1e20, 0x0FFF001F03030000); + + /*** Set GEM regs (masks still on, no irpts can occur yet) ***/ + + /* Init_8: GEM XFIR */ + REGW(0x3e0008, 0x0000000000000000); + /* Init_9: GEM WOF */ + REGW(0x3e0028, 0x0000000000000000); + + /*** Set Damage Controls (needs to match recov.) ***/ + + /* Init_10: LDCP */ + REGW(0x3e1c18, 0xF00086C0B4FCFFFF); + + /*** Read status (optional) ***/ + + /* Init_11: Read status */ + val = REGR(0x3e1c10); + printf("P7IOC: Init_11 Status: %016llx\n", val); + + /*** Set running configuration **/ + + /* Init_12: Configuration reg (modes, values, timers) */ + REGW(0x3e1c08, 0x10000077CE100000); + + /* Init_13: Cmd/Dat Crd Allocation */ + REGW(0x3e1c20, 0x00000103000700FF); + /* Init_14: GP reg - disable errs, wrap, stop_trc */ + REGW(0x3e1018, 0x0000000000000000); + /* Init_15: Configuration reg (start init timers) */ + cfg = REGR(0x3e1c08); + REGW(0x3e1c08, cfg | 0x00003f0000000000); + + /*** Setup interrupts ***/ + + /* Init_16: BUID Register + * + * XXX NOTE: This needs to be clarified. According to the doc + * the register contains a 9-bit BUID, which makes sense so far. + * + * However, the initialization sequence says "depends on which + * GX bus) which doesn't since afaik the GX bus number is encoded + * in the BUID Extension bit which is right *above* the 9-bit + * BUID in the interrupt message. + * + * So I must be missing something here... For now I'll just + * write my 9-bit BUID and we'll see what happens. + * + */ + REGW(0x3e1800, (uint64_t)ioc->rgc_buid << PPC_BITLSHIFT(31)); + + /* Init_17: Supposed to lock the IODA table but we aren't racing + * with anybody so there is little point. + * + * Note: If/when we support some kind of error recovery that + * involves re-initializing the IOC, then we might have + * to take some locks but it's assumed that the necessary + * lock(s) will be obtained by the caller. + */ + //REGR(0x3e1840, 0x0000000000000000); + + /* Init_18: IODA Table Addr: Select IST*/ + REGW(0x3e1820, 0x8001000000000000); + /* Init_19: IODA Table Data: IRPT 0 */ + REGW(0x3e1830, 0x0000000000000000); + /* Init_20: IODA Table Data: IRPT 1 */ + REGW(0x3e1830, 0x0000000000000000); + /* Init_21: IODA Table Addr: Select HRT */ + REGW(0x3e1820, 0x8000000000000000); + /* Init_22: IODA Table Data: HRT + * + * XXX Figure out what this actually is and what value should + * we use. For now, do like BML and use 0 + */ + for (i = 0; i < 4; i++) + REGW(0x3e1830, 0x0000000000000000); + + /* Init_23: IODA Table Addr: select XIVT */ + REGW(0x3e1820, 0x8002000000000000); + /* Init_24: IODA Table Data: Mask all interrupts */ + for (i = 0; i < 16; i++) + REGW(0x3e1830, 0x000000ff00000000); + + /* Init_25: Clear table lock if any was stale */ + REGW(0x3e1840, 0x0000000000000000); + + /* Init_32..37: Set the PHB AIB addresses. We configure those + * to the values recommended in the p7IOC doc. + * + * XXX NOTE: I cannot find a documentation for these, I assume + * they just take the full 64-bit address, but we may want to + * dbl check just in case (it seems to be what BML does but + * I'm good at mis-reading Milton's Perl). 
+ */ + for (i = 0; i < P7IOC_NUM_PHBS; i++) { + if (!p7ioc_phb_enabled(ioc, i)) + continue; + REGW(0x3e1080 + (i << 3), + ioc->mmio1_win_start + PHBn_AIB_BASE(i)); + } +} + +static void p7ioc_init_ci_routing(struct p7ioc *ioc) +{ + unsigned int i, j = 0; + uint64_t rmatch[47]; + uint64_t rmask[47]; + uint64_t pmask; + + /* Init_130: clear all matches (except 47 which routes to the RGC) */ + for (i = 0; i < 47; i++) { + rmatch[i] = REGR(P7IOC_CI_RMATC_REG(i)) & + ~(P7IOC_CI_RMATC_ADDR_VALID | + P7IOC_CI_RMATC_BUID_VALID | + P7IOC_CI_RMATC_TYPE_VALID); + rmask[i] = 0; + REGW(P7IOC_CI_RMATC_REG(i), rmatch[i]); + } + + /* Init_131...224: configure routing for everything except RGC + * + * We are using a slightly different routing setup than the + * example to make the code easier. We configure all PHB + * routing entries by doing all of PHB0 first, then all of PHB1 + * etc... + * + * Then we append everything else except the RGC itself which + * remains hard wired at entry 47. So the unused entries live + * at 39..46. + * + * - 0 : PHB0 LSI BUID + * - 1 : PHB0 MSI BUID + * - 2 : PHB0 AIB Registers + * - 3 : PHB0 IO Space + * - 4 : PHB0 M32 Space + * - 5 : PHB0 M64 Space + * - 6..11 : PHB1 + * - 12..17 : PHB2 + * - 18..23 : PHB3 + * - 24..29 : PHB4 + * - 30..35 : PHB5 + * - 36 : Invalidates broadcast (FMTC) + * - 37 : Interrupt response for RGC + * - 38 : RGC GEM BUID + * - 39..46 : Unused (alternate M64 ?) + * - 47 : RGC ASB Registers (catch all) + */ + + /* Helper macro to set a rule */ +#define CI_ADD_RULE(p, k, d, m) do { \ + rmask[j] = P7IOC_CI_RMATC_ENCODE_##k(m); \ + rmatch[j]= P7IOC_CI_RMATC_PORT(p) | \ + P7IOC_CI_RMATC_##k##_VALID | \ + P7IOC_CI_RMATC_ENCODE_##k(d); \ + j++; \ + } while (0) + + pmask = 0; + for (i = 0; i < P7IOC_NUM_PHBS; i++) { + unsigned int buid_base = ioc->buid_base + PHBn_BUID_BASE(i); + + if (!p7ioc_phb_enabled(ioc, i)) + continue; + + /* LSI BUIDs, match all 9 bits (1 BUID per PHB) */ + CI_ADD_RULE(P7IOC_CI_PHB_PORT(i), BUID, + buid_base + PHB_BUID_LSI_OFFSET, 0x1ff); + + /* MSI BUIDs, match 4 bits (16 BUIDs per PHB) */ + CI_ADD_RULE(P7IOC_CI_PHB_PORT(i), BUID, + buid_base + PHB_BUID_MSI_OFFSET, 0x1f0); + + /* AIB reg space */ + CI_ADD_RULE(P7IOC_CI_PHB_PORT(i), ADDR, + ioc->mmio1_win_start + PHBn_AIB_BASE(i), + ~(PHBn_AIB_SIZE - 1)); + + /* IO space */ + CI_ADD_RULE(P7IOC_CI_PHB_PORT(i), ADDR, + ioc->mmio1_win_start + PHBn_IO_BASE(i), + ~(PHB_IO_SIZE - 1)); + + /* M32 space */ + CI_ADD_RULE(P7IOC_CI_PHB_PORT(i), ADDR, + ioc->mmio2_win_start + PHBn_M32_BASE(i), + ~(PHB_M32_SIZE - 1)); + + /* M64 space */ + CI_ADD_RULE(P7IOC_CI_PHB_PORT(i), ADDR, + ioc->mmio2_win_start + PHBn_M64_BASE(i), + ~(PHB_M64_SIZE - 1)); + + /* For use with invalidate bcasts */ + pmask |= P7IOC_CI_PHB_PORT(i); + } + + /* Invalidates broadcast to all PHBs */ + CI_ADD_RULE(pmask, TYPE, 0x80, 0xf0); + + /* Interrupt responses go to RGC */ + CI_ADD_RULE(P7IOC_CI_RGC_PORT, TYPE, 0x60, 0xf0); + + /* RGC GEM BUID (1 BUID) */ + CI_ADD_RULE(P7IOC_CI_RGC_PORT, BUID, ioc->rgc_buid, 0x1ff); + + /* Program the values masks first */ + for (i = 0; i < 47; i++) + REGW(P7IOC_CI_RMASK_REG(i), rmask[i]); + for (i = 0; i < 47; i++) + REGW(P7IOC_CI_RMATC_REG(i), rmatch[i]); + + /* Init_225: CI Match 47 (Configure RGC catch all) */ + REGW(P7IOC_CI_RMASK_REG(47), 0x0000000000000000); + REGW(P7IOC_CI_RMATC_REG(47), 0x4000800000000000); + +#ifdef DUMP_CI_ROUTING + printf("P7IOC: CI Routing table:\n"); + for (i = 0; i < 48; i++) + printf(" [%.2d] MTCH: %016llx MSK: %016llx\n", i, + REGR(P7IOC_CI_RMATC_REG(i)), + 
REGR(P7IOC_CI_RMASK_REG(i))); +#endif /* DUMP_CI_ROUTING */ +} + +static void p7ioc_init_CI(struct p7ioc *ioc) +{ + printf("P7IOC: Init CI...\n"); + + /*** Clear ERPT macros ***/ + + /* XXX NOTE: The doc seems to also provide "alternate freq ratio" + * settings. Not sure what they are about, let's stick to the + * original values for now. + */ + + /* Init_1: CI Port 0 Configuration */ + REGW(0x3d0000, 0x420000C0073F0002); + /* Init_2: CI Port 0 Configuration */ + REGW(0x3d0000, 0x020000C0073F0002); + /* Init_3: CI Port 1 Configuration */ + REGW(0x3d1000, 0x42000FCF07200002); + /* Init_4: CI Port 1 Configuration */ + REGW(0x3d1000, 0x02000FCF07200002); + /* Init_5: CI Port 2 Configuration */ + REGW(0x3d2000, 0x420000C307200002); + /* Init_6: CI Port 2 Configuration */ + REGW(0x3d2000, 0x020000C307200002); + /* Init_7: CI Port 3 Configuration */ + REGW(0x3d3000, 0x420000C307200002); + /* Init_8: CI Port 3 Configuration */ + REGW(0x3d3000, 0x020000C307200002); + /* Init_9: CI Port 4 Configuration */ + REGW(0x3d4000, 0x420000C307200002); + /* Init_10: CI Port 4 Configuration */ + REGW(0x3d4000, 0x020000C307200002); + /* Init_11: CI Port 5 Configuration */ + REGW(0x3d5000, 0x420000C307200002); + /* Init_12: CI Port 5 Configuration */ + REGW(0x3d5000, 0x020000C307200002); + /* Init_13: CI Port 6 Configuration */ + REGW(0x3d6000, 0x420000C307200002); + /* Init_14: CI Port 6 Configuration */ + REGW(0x3d6000, 0x020000C307200002); + /* Init_15: CI Port 7 Configuration */ + REGW(0x3d7000, 0x420000C307200002); + /* Init_16: CI Port 7 Configuration */ + REGW(0x3d7000, 0x020000C307200002); + + /*** Set LEM regs (need to match recov.) ***/ + + /* Init_17: CI Port 0 LEM FIR Accumulator */ + REGW(0x3d0200, 0x0000000000000000); + /* Init_18: CI Port 0 LEM Action 0 */ + REGW(0x3d0230, 0x0A00000000000000); + /* Init_19: CI Port 0 LEM Action 1 */ + REGW(0x3d0238, 0x0000000000000000); + /* Init_20: CI Port 0 LEM WOF */ + REGW(0x3d0240, 0x0000000000000000); + /* Init_21: CI Port 0 LEM Mask (AND write) */ + REGW(0x3d0220, 0x0200000000000000); + /* Init_22: CI Port 1 LEM FIR Accumularor */ + REGW(0x3d1200, 0x0000000000000000); + /* Init_23: CI Port 1 LEM Action 0 */ + REGW(0x3d1230, 0x0000000000000000); + /* Init_24: CI Port 1 LEM Action 1 */ + REGW(0x3d1238, 0x0000000000000000); + /* Init_25: CI Port 1 LEM WOF */ + REGW(0x3d1240, 0x0000000000000000); + /* Init_26: CI Port 1 LEM Mask (AND write) */ + REGW(0x3d1220, 0x0000000000000000); + /* Init_27: CI Port 2 LEM FIR Accumulator */ + REGW(0x3d2200, 0x0000000000000000); + /* Init_28: CI Port 2 LEM Action 0 */ + REGW(0x3d2230, 0xA4F4000000000000); + /* Init_29: CI Port 2 LEM Action 1 */ + REGW(0x3d2238, 0x0000000000000000); + /* Init_30: CI Port 2 LEM WOF */ + REGW(0x3d2240, 0x0000000000000000); + /* Init_31: CI Port 2 LEM Mask (AND write) */ + REGW(0x3d2220, 0x0000000000000000); + /* Init_32: CI Port 3 LEM FIR Accumulator */ + REGW(0x3d3200, 0x0000000000000000); + /* Init_33: CI Port 3 LEM Action 0 */ + REGW(0x3d3230, 0xA4F4000000000000); + /* Init_34: CI Port 3 LEM Action 1 */ + REGW(0x3d3238, 0x0000000000000000); + /* Init_35: CI Port 3 LEM WOF */ + REGW(0x3d3240, 0x0000000000000000); + /* Init_36: CI Port 3 LEM Mask (AND write) */ + REGW(0x3d3220, 0x0000000000000000); + /* Init_37: CI Port 4 LEM FIR Accumulator */ + REGW(0x3d4200, 0x0000000000000000); + /* Init_38: CI Port 4 Action 0 */ + REGW(0x3d4230, 0xA4F4000000000000); + /* Init_39: CI Port 4 Action 1 */ + REGW(0x3d4238, 0x0000000000000000); + /* Init_40: CI Port 4 WOF */ + REGW(0x3d4240, 
0x0000000000000000); + /* Init_41: CI Port 4 Mask (AND write) */ + REGW(0x3d4220, 0x0000000000000000); + /* Init_42: CI Port 5 LEM FIR Accumulator */ + REGW(0x3d5200, 0x0000000000000000); + /* Init_43: CI Port 5 Action 0 */ + REGW(0x3d5230, 0xA4F4000000000000); + /* Init_44: CI Port 5 Action 1 */ + REGW(0x3d5238, 0x0000000000000000); + /* Init_45: CI Port 4 WOF */ + REGW(0x3d5240, 0x0000000000000000); + /* Init_46: CI Port 5 Mask (AND write) */ + REGW(0x3d5220, 0x0000000000000000); + /* Init_47: CI Port 6 LEM FIR Accumulator */ + REGW(0x3d6200, 0x0000000000000000); + /* Init_48: CI Port 6 Action 0 */ + REGW(0x3d6230, 0xA4F4000000000000); + /* Init_49: CI Port 6 Action 1 */ + REGW(0x3d6238, 0x0000000000000000); + /* Init_50: CI Port 6 WOF */ + REGW(0x3d6240, 0x0000000000000000); + /* Init_51: CI Port 6 Mask (AND write) */ + REGW(0x3d6220, 0x0000000000000000); + /* Init_52: CI Port 7 LEM FIR Accumulator */ + REGW(0x3d7200, 0x0000000000000000); + /* Init_53: CI Port 7 Action 0 */ + REGW(0x3d7230, 0xA4F4000000000000); + /* Init_54: CI Port 7 Action 1 */ + REGW(0x3d7238, 0x0000000000000000); + /* Init_55: CI Port 7 WOF */ + REGW(0x3d7240, 0x0000000000000000); + /* Init_56: CI Port 7 Mask (AND write) */ + REGW(0x3d7220, 0x0000000000000000); + + /*** Set Damage Controls (need match recov.) ***/ + + /* Init_57: CI Port 0 LDCP*/ + REGW(0x3d0010, 0x421A0000000075FF); + /* Init_58: CI Port 1 LDCP */ + REGW(0x3d1010, 0x421A000000007FFF); + /* Init_59: CI Port 2 LDCP */ + REGW(0x3d2010, 0x421A24F400005B0B); + /* Init_60: CI Port 3 LDCP */ + REGW(0x3d3010, 0x421A24F400005B0B); + /* Init_61: CI Port 4 LDCP */ + REGW(0x3d4010, 0x421A24F400005B0B); + /* Init_62: CI Port 5 LDCP */ + REGW(0x3d5010, 0x421A24F400005B0B); + /* Init_63: CI Port 6 LDCP */ + REGW(0x3d6010, 0x421A24F400005B0B); + /* Init_64: CI Port 7 LDCP */ + REGW(0x3d7010, 0x421A24F400005B0B); + + /*** Setup Trace 0 ***/ + + /* Init_65: CI Trc 0 DBG - Run/Status (stop trace) */ + REGW(0x3d0810, 0x5000000000000000); + /* Init_66: CI Trc 0 DBG - Mode (not cross trig CA's) */ + REGW(0x3d0808, 0xB0000000F0000000); + /* Init_66a: CI Trc 0 DBG - C0 (stop on error) */ + REGW(0x3d0818, 0xF4F00FFF00000000); + /* Init_67: CI Trc 0 DBG - Select (port 0 mode 2) */ + REGW(0x3d0878, 0x0002000000000000); + /* Init_68: CI Trc 0 CA0 - Pattern A (RX cmd val) */ + REGW(0x3d0880, 0xC0200000DFFFFF00); + /* Init_69: CI Trc 0 CA0 - Trigger 0 (Pattern A) */ + REGW(0x3d08a0, 0x8000000000000000); + /* Init_70: CI Trc 0 - Mode */ + REGW(0x3d08b0, 0x8000000000000000); + /* Init_71: CI Trc 0 CA1 - Pattern A (TX cmd val) */ + REGW(0x3d0900, 0xC0200000DFFFFF00); + /* Init_72: CI Trc 0 CA1 - Trigger 0 (Pattern A) */ + REGW(0x3d0920, 0x8000000000000000); + /* Init_73: CI Trc 0 CA1 - Mode */ + REGW(0x3d0930, 0x8000000000000000); + /* Init_74: CI Trc 0 DBG - Run/Status (start trace) */ + REGW(0x3d0810, 0x8000000000000000); + + /*** Setup Trace 1 ***/ + + /* Init_75: CI Trc 1 DBG - Run/Status (stop trace) */ + REGW(0x3d0c10, 0x5000000000000000); + /* Init_76: CI Trc 1 DBG - Mode (not cross trig CA's) */ + REGW(0x3d0c08, 0xB0000000F0000000); + /* Init_76a: CI Trc 1 DBG - C0 (stop on error) */ + REGW(0x3d0c18, 0xF4F00FFF00000000); + /* Init_77: CI Trc 1 DBG - Select (port 1 mode 2) */ + REGW(0x3d0c78, 0x0102000000000000); + /* Init_78: CI Trc 1 CA0 - Pattern A (RX cmd val) */ + REGW(0x3d0c80, 0xC0200000DFFFFF00); + /* Init_79: CI Trc 1 CA0 - Trigger 0 (Pattern A) */ + REGW(0x3d0ca0, 0x8000000000000000); + /* Init_80: CI Trc 1 CA0 - Mode */ + REGW(0x3d0cb0, 0x8000000000000000); + /* 
Init_81: CI Trc 1 CA1 - Pattern A (TX cmd val) */ + REGW(0x3d0d00, 0xC0200000DFFFFF00); + /* Init_82: CI Trc 1 CA1 - Trigger 0 (Pattern A) */ + REGW(0x3d0d20, 0x8000000000000000); + /* Init_83: CI Trc 1 CA1 - Mode */ + REGW(0x3d0d30, 0x8000000000000000); + /* Init_84: CI Trc 1 DBG - Run/Status (start trace) */ + REGW(0x3d0c10, 0x8000000000000000); + + /* Init_85...92: + * + * XXX NOTE: Here we normally read the Port 0 to 7 status regs + * which is optional. Eventually we might want to do it to check + * if the status matches expectations + * + * (regs 0x3d0008 to 0x3d7008) + */ + + /*** Set buffer allocations (credits) ***/ + + /* Init_93: CI Port 0 Rx Cmd Buffer Allocation */ + REGW(0x3d0050, 0x0808040400000000); + /* Init_94: CI Port 0 Rx Dat Buffer Allocation */ + REGW(0x3d0060, 0x0006000200000000); + /* Init_95: CI Port 1 Tx Cmd Buffer Allocation */ + REGW(0x3d1030, 0x0000040400000000); + /* Init_96: CI Port 1 Tx Dat Buffer Allocation */ + REGW(0x3d1040, 0x0000004800000000); + /* Init_97: CI Port 1 Rx Cmd Buffer Allocation */ + REGW(0x3d1050, 0x0008000000000000); + /* Init_98: CI Port 1 Rx Dat Buffer Allocation */ + REGW(0x3d1060, 0x0048000000000000); + /* Init_99: CI Port 2 Tx Cmd Buffer Allocation */ + REGW(0x3d2030, 0x0808080800000000); + /* Init_100: CI Port 2 Tx Dat Buffer Allocation */ + REGW(0x3d2040, 0x0086008200000000); + /* Init_101: CI Port 2 Rx Cmd Buffer Allocation */ + REGW(0x3d2050, 0x0808080800000000); + /* Init_102: CI Port 2 Rx Dat Buffer Allocation */ + REGW(0x3d2060, 0x8648000000000000); + /* Init_103: CI Port 3 Tx Cmd Buffer Allocation */ + REGW(0x3d3030, 0x0808080800000000); + /* Init_104: CI Port 3 Tx Dat Buffer Allocation */ + REGW(0x3d3040, 0x0086008200000000); + /* Init_105: CI Port 3 Rx Cmd Buffer Allocation */ + REGW(0x3d3050, 0x0808080800000000); + /* Init_106: CI Port 3 Rx Dat Buffer Allocation */ + REGW(0x3d3060, 0x8648000000000000); + /* Init_107: CI Port 4 Tx Cmd Buffer Allocation */ + REGW(0x3d4030, 0x0808080800000000); + /* Init_108: CI Port 4 Tx Dat Buffer Allocation */ + REGW(0x3d4040, 0x0086008200000000); + /* Init_109: CI Port 4 Rx Cmd Buffer Allocation */ + REGW(0x3d4050, 0x0808080800000000); + /* Init_110: CI Port 4 Rx Dat Buffer Allocation */ + REGW(0x3d4060, 0x8648000000000000); + /* Init_111: CI Port 5 Tx Cmd Buffer Allocation */ + REGW(0x3d5030, 0x0808080800000000); + /* Init_112: CI Port 5 Tx Dat Buffer Allocation */ + REGW(0x3d5040, 0x0086008200000000); + /* Init_113: CI Port 5 Rx Cmd Buffer Allocation */ + REGW(0x3d5050, 0x0808080800000000); + /* Init_114: CI Port 5 Rx Dat Buffer Allocation */ + REGW(0x3d5060, 0x8648000000000000); + /* Init_115: CI Port 6 Tx Cmd Buffer Allocation */ + REGW(0x3d6030, 0x0808080800000000); + /* Init_116: CI Port 6 Tx Dat Buffer Allocation */ + REGW(0x3d6040, 0x0086008200000000); + /* Init_117: CI Port 6 Rx Cmd Buffer Allocation */ + REGW(0x3d6050, 0x0808080800000000); + /* Init_118: CI Port 6 Rx Dat Buffer Allocation */ + REGW(0x3d6060, 0x8648000000000000); + /* Init_119: CI Port 7 Tx Cmd Buffer Allocation */ + REGW(0x3d7030, 0x0808080800000000); + /* Init_120: CI Port 7 Tx Dat Buffer Allocation */ + REGW(0x3d7040, 0x0086008200000000); + /* Init_121: CI Port 7 Rx Cmd Buffer Allocation */ + REGW(0x3d7050, 0x0808080800000000); + /* Init_122: CI Port 6 Rx Dat Buffer Allocation */ + REGW(0x3d7060, 0x8648000000000000); + + /*** Channel ordering ***/ + + /* Init_123: CI Port 1 Ordering */ + REGW(0x3d1070, 0x73D0735E00000000); + /* Init_124: CI Port 2 Ordering */ + REGW(0x3d2070, 0x73D0735E00000000); + /* Init_125: 
CI Port 3 Ordering */ + REGW(0x3d3070, 0x73D0735E00000000); + /* Init_126: CI Port 4 Ordering */ + REGW(0x3d4070, 0x73D0735E00000000); + /* Init_127: CI Port 5 Ordering */ + REGW(0x3d5070, 0x73D0735E00000000); + /* Init_128: CI Port 6 Ordering */ + REGW(0x3d6070, 0x73D0735E00000000); + /* Init_129: CI POrt 7 Ordering */ + REGW(0x3d7070, 0x73D0735E00000000); + + /*** Setup routing (port 0 only) */ + + p7ioc_init_ci_routing(ioc); + + /*** Set Running Configuration/Crd Init Timers *** + * + * XXX NOTE: Supposed to only modify bits 8:15 + */ + + /* Init_226: CI Port 1 Configuration */ + REGW(0x3d1000, 0x023F0FCF07200002); + /* Init_227: CI Port 2 Configuration */ + REGW(0x3d2000, 0x023F00C307200002); + /* Init_228: CI Port 3 Configuration */ + REGW(0x3d3000, 0x023F00C307200002); + /* Init_229: CI Port 4 Configuration */ + REGW(0x3d4000, 0x023F00C307200002); + /* Init_230: CI Port 5 Configuration */ + REGW(0x3d5000, 0x023F00C307200002); + /* Init_231: CI Port 6 Configuration */ + REGW(0x3d6000, 0x023F00C307200002); + /* Init_232: CI Port 7 Configuration */ + REGW(0x3d7000, 0x023F00C307200002); + /* Init_233: CI Port 0 Configuration */ + REGW(0x3d0000, 0x023F00C0073F0002); +} + +static void p7ioc_init_PHBs(struct p7ioc *ioc) +{ + unsigned int i; + + printf("P7IOC: Init PHBs...\n"); + + /* We use the same reset sequence that we use for + * fast reboot for consistency + */ + for (i = 0; i < P7IOC_NUM_PHBS; i++) { + if (p7ioc_phb_enabled(ioc, i)) + p7ioc_phb_reset(&ioc->phbs[i].phb); + } +} + +static void p7ioc_init_MISC(struct p7ioc *ioc) +{ + printf("P7IOC: Init MISC...\n"); + + /*** Set LEM regs ***/ + + /* Init_1: LEM FIR Accumulator */ + REGW(0x3ea000, 0x0000000000000000); + /* Init_2: LEM Action 0 */ + REGW(0x3ea030, 0xFFFFFFFCEE3FFFFF); + /* Init_3: LEM Action 1 */ + REGW(0x3ea038, 0x0000000001C00000); + /* Init_4: LEM WOF */ + REGW(0x3ea040, 0x0000000000000000); + /* Init_5: LEM Mask (AND write) */ + REGW(0x3ea020, 0x000F03F0CD3FFFFF); + /* Init_5.1: I2C LEM FIR Accumulator */ + REGW(0x3eb000, 0x0000000000000000); + /* Init_5.2: I2C LEM Action 0 */ + REGW(0x3eb030, 0xEE00000000000000); + /* Init_5.3: I2C LEM Action 1 */ + REGW(0x3eb038, 0x0000000000000000); + /* Init_5.4: I2C LEM WOF */ + REGW(0x3eb040, 0x0000000000000000); + /* Init_5.5: I2C LEM Mask (AND write) */ + REGW(0x3eb020, 0x4600000000000000); + + /*** Set RGC GP bits (error enables) ***/ + + /* Init_7: RGC GP0 control (enable umux errors) */ + REGW(0x3e1018, 0x8888880000000000); + + /*** Central Trace Setup *** + * + * By default trace 4 PHBs Rx/Tx, but this can be changed + * for debugging purposes + */ + + /* Init_8: */ + REGW(0x3ea810, 0x5000000000000000); + /* Init_9: */ + REGW(0x3ea800, 0x0000000000000000); + /* Init_10: */ + REGW(0x3ea808, 0xB0000000F0000000); + /* Init_11: */ + REGW(0x3ea818, 0xF4F00FFF00000000); + /* Init_12: */ + REGW(0x3ea820, 0x0000000000000000); + /* Init_13: */ + REGW(0x3ea828, 0x0000000000000000); + /* Init_14: */ + REGW(0x3ea830, 0x0000000000000000); + /* Init_15: */ + REGW(0x3ea838, 0x0000000000000000); + /* Init_16: */ + REGW(0x3ea840, 0x0000000000000000); + /* Init_17: */ + REGW(0x3ea878, 0x0300000000000000); + + /* Init_18: PHB0 mux select (Rx/Tx) */ + REGW(0x000F80, 0x0000000000000000); + /* Init_19: PHB1 mux select (Rx/Tx) */ + REGW(0x010F80, 0x0000000000000000); + /* Init_19.0: PHB2 mux select (Rx/Tx) */ + REGW(0x020F80, 0x0000000000000000); + /* Init_19.1: PHB3 mux select (Rx/Tx) */ + REGW(0x030F80, 0x0000000000000000); + /* Init_19.2: PHB4 mux select (Rx/Tx) */ + REGW(0x040F80, 
0x0000000000000000); + /* Init_19.3: PHB5 mux select (Rx/Tx) */ + REGW(0x050F80, 0x0000000000000000); + + /* Init_20: */ + REGW(0x3ea880, 0x40008000FF7F0000); + /* Init_21: */ + REGW(0x3ea888, 0x0000000000000000); + /* Init_22: */ + REGW(0x3ea890, 0x0000000000000000); + /* Init_23: */ + REGW(0x3ea898, 0x0000000000000000); + /* Init_24: */ + REGW(0x3ea8a0, 0x8000000000000000); + /* Init_25: */ + REGW(0x3ea8a8, 0x0000000000000000); + /* Init_26: */ + REGW(0x3ea8b0, 0x8000000000000000); + /* Init_27: */ + REGW(0x3ea8b8, 0x0000000000000000); + /* Init_28: */ + REGW(0x3ea8c0, 0x0000000000000000); + /* Init_29: */ + REGW(0x3ea900, 0x40008000FF7F0000); + /* Init_30: */ + REGW(0x3ea908, 0x0000000000000000); + /* Init_31: */ + REGW(0x3ea910, 0x0000000000000000); + /* Init_32: */ + REGW(0x3ea918, 0x0000000000000000); + /* Init_33: */ + REGW(0x3ea920, 0x8000000000000000); + /* Init_34: */ + REGW(0x3ea928, 0x0000000000000000); + /* Init_35: */ + REGW(0x3ea930, 0x8000000000000000); + /* Init_36: */ + REGW(0x3ea938, 0x0000000000000000); + /* Init_37: */ + REGW(0x3ea940, 0x0000000000000000); + /* Init_38: */ + REGW(0x3ea980, 0x40008000FF7F0000); + /* Init_39: */ + REGW(0x3ea988, 0x0000000000000000); + /* Init_40: */ + REGW(0x3ea990, 0x0000000000000000); + /* Init_41: */ + REGW(0x3ea998, 0x0000000000000000); + /* Init_42: */ + REGW(0x3ea9a0, 0x8000000000000000); + /* Init_43: */ + REGW(0x3ea9a8, 0x0000000000000000); + /* Init_44: */ + REGW(0x3ea9b0, 0x8000000000000000); + /* Init_45: */ + REGW(0x3ea9b8, 0x0000000000000000); + /* Init_46: */ + REGW(0x3ea9c0, 0x0000000000000000); + /* Init_47: */ + REGW(0x3eaa00, 0x40008000FF7F0000); + /* Init_48: */ + REGW(0x3eaa08, 0x0000000000000000); + /* Init_49: */ + REGW(0x3eaa10, 0x0000000000000000); + /* Init_50: */ + REGW(0x3eaa18, 0x0000000000000000); + /* Init_51: */ + REGW(0x3eaa20, 0x8000000000000000); + /* Init_52: */ + REGW(0x3eaa28, 0x0000000000000000); + /* Init_53: */ + REGW(0x3eaa30, 0x8000000000000000); + /* Init_54: */ + REGW(0x3eaa38, 0x0000000000000000); + /* Init_55: */ + REGW(0x3eaa40, 0x0000000000000000); + /* Init_56: */ + REGW(0x3ea810, 0x1000000000000000); + /* Init_57: */ + REGW(0x3ea810, 0x8000000000000000); + + /*** I2C Master init fixup */ + + /* Init_58: I2C Master Operation Control */ + REGW(0x3eb0a8, 0x8100000000000000); +} + +static void p7ioc_init_GEM(struct p7ioc *ioc) +{ + printf("P7IOC: Init GEM...\n"); + + /*** Check for errors */ + + /* XXX TODO */ +#if 0 + /* Init_1: */ + REGR(0x3e0008, 0); + /* Init_2: */ + REGR(0x3e0010, 0); + /* Init_3: */ + REGR(0x3e0018, 0); +#endif + + /*** Get ready for new errors, allow interrupts *** + * + * XXX: Need to leave all unused port masked to prevent + * invalid errors + */ + + /* Init_4: GEM XFIR */ + REGW(0x3e0008, 0x0000000000000000); + /* Init_5: GEM Mask (See FIXME) */ + REGW(0x3e0020, 0x000F033FFFFFFFFF); + /* Init_6: GEM WOF */ + REGW(0x3e0028, 0x0000000000000000); +} + +int64_t p7ioc_inits(struct p7ioc *ioc) +{ + p7ioc_init_BI(ioc); + p7ioc_init_MISC_HSS(ioc); + p7ioc_init_RGC(ioc); + p7ioc_init_CI(ioc); + p7ioc_init_PHBs(ioc); + p7ioc_init_MISC(ioc); + p7ioc_init_GEM(ioc); + + return OPAL_SUCCESS; + +} + +void p7ioc_reset(struct io_hub *hub) +{ + struct p7ioc *ioc = iohub_to_p7ioc(hub); + unsigned int i; + + /* We could do a full cold reset of P7IOC but for now, let's + * not bother and just try to clean up the interrupts as best + * as possible + */ + + /* XXX TODO: RGC interrupts */ + + printf("P7IOC: Clearing IODA...\n"); + + /* First clear all IODA tables and wait a bit */ + for (i 
= 0; i < 6; i++) { + if (p7ioc_phb_enabled(ioc, i)) + p7ioc_phb_reset(&ioc->phbs[i].phb); + } +} diff --git a/hw/p7ioc-phb.c b/hw/p7ioc-phb.c new file mode 100644 index 00000000..8dc76163 --- /dev/null +++ b/hw/p7ioc-phb.c @@ -0,0 +1,3206 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void p7ioc_phb_trace(struct p7ioc_phb *p, FILE *s, const char *fmt, ...) +__attribute__ ((format (printf, 3, 4))); + +static void p7ioc_phb_trace(struct p7ioc_phb *p, FILE *s, const char *fmt, ...) +{ + /* Use a temp stack buffer to print all at once to avoid + * mixups of a trace entry on SMP + */ + char tbuf[128 + 10]; + va_list args; + char *b = tbuf; + + b += sprintf(b, "PHB%d: ", p->phb.opal_id); + va_start(args, fmt); + vsnprintf(b, 128, fmt, args); + va_end(args); + fputs(tbuf, s); +} +#define PHBDBG(p, fmt...) p7ioc_phb_trace(p, stdout, fmt) +#define PHBERR(p, fmt...) p7ioc_phb_trace(p, stderr, fmt) + +/* Helper to select an IODA table entry */ +static inline void p7ioc_phb_ioda_sel(struct p7ioc_phb *p, uint32_t table, + uint32_t addr, bool autoinc) +{ + out_be64(p->regs + PHB_IODA_ADDR, + (autoinc ? PHB_IODA_AD_AUTOINC : 0) | + SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) | + SETFIELD(PHB_IODA_AD_TADR, 0ul, addr)); +} + +/* Helper to set the state machine timeout */ +static inline uint64_t p7ioc_set_sm_timeout(struct p7ioc_phb *p, uint64_t dur) +{ + uint64_t target, now = mftb(); + + target = now + dur; + if (target == 0) + target++; + p->delay_tgt_tb = target; + + return dur; +} + +/* + * Lock callbacks. Allows the OPAL API handlers to lock the + * PHB around calls such as config space, EEH, etc... 
+ */ +static void p7ioc_phb_lock(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + + lock(&p->lock); +} + +static void p7ioc_phb_unlock(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + + unlock(&p->lock); +} + +static bool p7ioc_phb_fenced(struct p7ioc_phb *p) +{ + struct p7ioc *ioc = p->ioc; + uint64_t fence, fbits; + + fbits = 0x0003000000000000 >> (p->index * 4); + fence = in_be64(ioc->regs + P7IOC_CHIP_FENCE_SHADOW); + + return (fence & fbits) != 0; +} + +/* + * Configuration space access + * + * The PHB lock is assumed to be already held + */ +static int64_t p7ioc_pcicfg_check(struct p7ioc_phb *p, uint32_t bdfn, + uint32_t offset, uint32_t size) +{ + uint32_t sm = size - 1; + + if (offset > 0xfff || bdfn > 0xffff) + return OPAL_PARAMETER; + if (offset & sm) + return OPAL_PARAMETER; + + /* The root bus only has a device at 0 and we get into an + * error state if we try to probe beyond that, so let's + * avoid that and just return an error to Linux + */ + if ((bdfn >> 8) == 0 && (bdfn & 0xff)) + return OPAL_HARDWARE; + + /* Check PHB state */ + if (p->state == P7IOC_PHB_STATE_BROKEN) + return OPAL_HARDWARE; + + return OPAL_SUCCESS; +} + +#define P7IOC_PCI_CFG_READ(size, type) \ +static int64_t p7ioc_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); \ + uint64_t addr; \ + void *base = p->regs; \ + int64_t rc; \ + \ + /* Initialize data in case of error */ \ + *data = (type)0xffffffff; \ + \ + rc = p7ioc_pcicfg_check(p, bdfn, offset, sizeof(type)); \ + if (rc) \ + return rc; \ + \ + if (p7ioc_phb_fenced(p)) { \ + if (!(p->flags & P7IOC_PHB_CFG_USE_ASB)) \ + return OPAL_HARDWARE; \ + \ + base = p->regs_asb; \ + } else if ((p->flags & P7IOC_PHB_CFG_BLOCKED) && bdfn != 0) { \ + return OPAL_HARDWARE; \ + } \ + \ + addr = PHB_CA_ENABLE | ((uint64_t)bdfn << PHB_CA_FUNC_LSH); \ + addr = SETFIELD(PHB_CA_REG, addr, offset); \ + out_be64(base + PHB_CONFIG_ADDRESS, addr); \ + *data = in_le##size(base + PHB_CONFIG_DATA + \ + (offset & (4 - sizeof(type)))); \ + \ + return OPAL_SUCCESS; \ +} + +#define P7IOC_PCI_CFG_WRITE(size, type) \ +static int64_t p7ioc_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); \ + void *base = p->regs; \ + uint64_t addr; \ + int64_t rc; \ + \ + rc = p7ioc_pcicfg_check(p, bdfn, offset, sizeof(type)); \ + if (rc) \ + return rc; \ + \ + if (p7ioc_phb_fenced(p)) { \ + if (!(p->flags & P7IOC_PHB_CFG_USE_ASB)) \ + return OPAL_HARDWARE; \ + \ + base = p->regs_asb; \ + } else if ((p->flags & P7IOC_PHB_CFG_BLOCKED) && bdfn != 0) { \ + return OPAL_HARDWARE; \ + } \ + \ + addr = PHB_CA_ENABLE | ((uint64_t)bdfn << PHB_CA_FUNC_LSH); \ + addr = SETFIELD(PHB_CA_REG, addr, offset); \ + out_be64(base + PHB_CONFIG_ADDRESS, addr); \ + out_le##size(base + PHB_CONFIG_DATA + \ + (offset & (4 - sizeof(type))), data); \ + \ + return OPAL_SUCCESS; \ +} + +P7IOC_PCI_CFG_READ(8, uint8_t) +P7IOC_PCI_CFG_READ(16, uint16_t) +P7IOC_PCI_CFG_READ(32, uint32_t) +P7IOC_PCI_CFG_WRITE(8, uint8_t) +P7IOC_PCI_CFG_WRITE(16, uint16_t) +P7IOC_PCI_CFG_WRITE(32, uint32_t) + +static int64_t p7ioc_presence_detect(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + + /* XXX Test for PHB in error state ? 
*/ + + if (reg & PHB_PCIE_SLOTCTL2_PRSTN_STAT) + return OPAL_SHPC_DEV_PRESENT; + + return OPAL_SHPC_DEV_NOT_PRESENT; +} + +static int64_t p7ioc_link_state(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + uint16_t lstat; + int64_t rc; + + /* XXX Test for PHB in error state ? */ + + /* Link is up, let's find the actual speed */ + if (!(reg & PHB_PCIE_DLP_TC_DL_LINKACT)) + return OPAL_SHPC_LINK_DOWN; + + rc = p7ioc_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_LSTAT, + &lstat); + if (rc < 0) { + /* Shouldn't happen */ + PHBERR(p, "Failed to read link status\n"); + return OPAL_HARDWARE; + } + if (!(lstat & PCICAP_EXP_LSTAT_DLLL_ACT)) + return OPAL_SHPC_LINK_DOWN; + + return GETFIELD(PCICAP_EXP_LSTAT_WIDTH, lstat); +} + +static int64_t p7ioc_sm_freset(struct p7ioc_phb *p) +{ + uint64_t reg; + uint32_t cfg32; + uint64_t ci_idx = p->index + 2; + + switch(p->state) { + case P7IOC_PHB_STATE_FUNCTIONAL: + /* If the slot isn't present, we needn't do it */ + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + if (!(reg & PHB_PCIE_SLOTCTL2_PRSTN_STAT)) { + PHBDBG(p, "Slot freset: no device\n"); + return OPAL_CLOSED; + } + + /* Mask PCIE port interrupts and AER receiver error */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0x7E00000000000000); + p7ioc_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, &cfg32); + cfg32 |= PCIECAP_AER_CE_RECVR_ERR; + p7ioc_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, cfg32); + + /* Mask CI port error and clear it */ + out_be64(p->ioc->regs + P7IOC_CIn_LEM_ERR_MASK(ci_idx), + 0xa4f4000000000000ul); + out_be64(p->regs + PHB_LEM_ERROR_MASK, + 0xadb650c9808dd051ul); + out_be64(p->ioc->regs + P7IOC_CIn_LEM_FIR(ci_idx), + 0x0ul); + + /* Disable link to avoid training issues */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + reg |= PHB_PCIE_DLP_TCTX_DISABLE; + out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, reg); + PHBDBG(p, "Slot freset: disable link training\n"); + + p->state = P7IOC_PHB_STATE_FRESET_DISABLE_LINK; + p->retries = 12; + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + case P7IOC_PHB_STATE_FRESET_DISABLE_LINK: + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (reg & PHB_PCIE_DLP_TCRX_DISABLED) { + /* Turn on freset */ + reg = in_be64(p->regs + PHB_RESET); + reg &= ~0x2000000000000000ul; + out_be64(p->regs + PHB_RESET, reg); + PHBDBG(p, "Slot freset: assert\n"); + + p->state = P7IOC_PHB_STATE_FRESET_ASSERT_DELAY; + return p7ioc_set_sm_timeout(p, secs_to_tb(1)); + } + + if (p->retries-- == 0) { + PHBDBG(p, "Slot freset: timeout to disable link training\n"); + goto error; + } + + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + case P7IOC_PHB_STATE_FRESET_ASSERT_DELAY: + /* Turn off freset */ + reg = in_be64(p->regs + PHB_RESET); + reg |= 0x2000000000000000ul; + out_be64(p->regs + PHB_RESET, reg); + PHBDBG(p, "Slot freset: deassert\n"); + + p->state = P7IOC_PHB_STATE_FRESET_DEASSERT_DELAY; + return p7ioc_set_sm_timeout(p, msecs_to_tb(200)); + case P7IOC_PHB_STATE_FRESET_DEASSERT_DELAY: + /* Restore link control */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + reg &= ~PHB_PCIE_DLP_TCTX_DISABLE; + out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, reg); + PHBDBG(p, "Slot freset: enable link training\n"); + + p->state = P7IOC_PHB_STATE_FRESET_WAIT_LINK; + p->retries = 100; + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + case P7IOC_PHB_STATE_FRESET_WAIT_LINK: + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (reg & PHB_PCIE_DLP_TC_DL_LINKACT) { + /* + 
* Clear spurious errors and enable PCIE port + * interrupts + */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, + 0x00E0000000000000); + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, + 0xFE65000000000000); + + /* Clear AER receiver error status */ + p7ioc_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_STATUS, + PCIECAP_AER_CE_RECVR_ERR); + /* Unmask receiver error status in AER */ + p7ioc_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, &cfg32); + cfg32 &= ~PCIECAP_AER_CE_RECVR_ERR; + p7ioc_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, cfg32); + /* Clear and Unmask CI port and PHB errors */ + out_be64(p->ioc->regs + P7IOC_CIn_LEM_FIR(ci_idx), + 0x0ul); + out_be64(p->regs + PHB_LEM_FIR_ACCUM, + 0x0ul); + out_be64(p->ioc->regs + P7IOC_CIn_LEM_ERR_MASK_AND(ci_idx), + 0x0ul); + out_be64(p->regs + PHB_LEM_ERROR_MASK, + 0x1249a1147f500f2cul); + PHBDBG(p, "Slot freset: link up!\n"); + + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + p->flags &= ~P7IOC_PHB_CFG_BLOCKED; + return OPAL_SUCCESS; + } + + if (p->retries-- == 0) { + uint16_t val; + + if (p->gen == 1) { + PHBDBG(p, "Slot freset: timeout for link up in Gen1 mode!\n"); + goto error; + } + + PHBDBG(p, "Slot freset: timeout for link up.\n"); + PHBDBG(p, "Slot freset: fallback to Gen1.\n"); + p->gen --; + + /* Limit speed to 2.5G */ + p7ioc_pcicfg_read16(&p->phb, 0, + p->ecap + PCICAP_EXP_LCTL2, &val); + val = SETFIELD(PCICAP_EXP_LCTL2_TLSPD, val, 1); + p7ioc_pcicfg_write16(&p->phb, 0, + p->ecap + PCICAP_EXP_LCTL2, + val); + + /* Retrain */ + p7ioc_pcicfg_read16(&p->phb, 0, + p->ecap + PCICAP_EXP_LCTL, &val); + p7ioc_pcicfg_write16(&p->phb, 0, + p->ecap + PCICAP_EXP_LCTL, + val | PCICAP_EXP_LCTL_LINK_RETRAIN); + + /* Enter FRESET_WAIT_LINK, again */ + p->state = P7IOC_PHB_STATE_FRESET_WAIT_LINK; + p->retries = 100; + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + } + + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + default: + break; + } + +error: + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + return OPAL_HARDWARE; +} + +static int64_t p7ioc_freset(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + + if (p->state != P7IOC_PHB_STATE_FUNCTIONAL) + return OPAL_HARDWARE; + + p->flags |= P7IOC_PHB_CFG_BLOCKED; + return p7ioc_sm_freset(p); +} + +static int64_t p7ioc_power_state(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + + /* XXX Test for PHB in error state ? */ + + if (reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT) + return OPAL_SHPC_POWER_ON; + + return OPAL_SHPC_POWER_OFF; +} + +static int64_t p7ioc_sm_slot_power_off(struct p7ioc_phb *p) +{ + uint64_t reg; + + switch(p->state) { + case P7IOC_PHB_STATE_FUNCTIONAL: + /* + * Check the presence and power status. If be not + * be present or power down, we stop here. + */ + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + if (!(reg & PHB_PCIE_SLOTCTL2_PRSTN_STAT)) { + PHBDBG(p, "Slot power off: no device\n"); + return OPAL_CLOSED; + } + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + if (!(reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT)) { + PHBDBG(p, "Slot power off: already off\n"); + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + return OPAL_SUCCESS; + } + + /* + * Mask PCIE port interrupt and turn power off + * + * We have to set bit 0 and clear it explicitly on PHB + * hotplug override register when doing power-off on the + * PHB slot. Otherwise, it won't take effect. That's the + * similar thing as we did for power-on. 
+ */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0x7e00000000000000); + reg = in_be64(p->regs + PHB_HOTPLUG_OVERRIDE); + reg &= ~(0x8c00000000000000ul); + reg |= 0x8400000000000000ul; + out_be64(p->regs + PHB_HOTPLUG_OVERRIDE, reg); + reg &= ~(0x8c00000000000000ul); + reg |= 0x0c00000000000000ul; + out_be64(p->regs + PHB_HOTPLUG_OVERRIDE, reg); + PHBDBG(p, "Slot power off: powering off...\n"); + + p->state = P7IOC_PHB_STATE_SPDOWN_STABILIZE_DELAY; + return p7ioc_set_sm_timeout(p, secs_to_tb(2)); + case P7IOC_PHB_STATE_SPDOWN_STABILIZE_DELAY: + /* + * The link should be stabilized after 2 seconds. + * We still need poll registers to make sure the + * power is really down every 1ms until limited + * 1000 times. + */ + p->retries = 1000; + p->state = P7IOC_PHB_STATE_SPDOWN_SLOT_STATUS; + PHBDBG(p, "Slot power off: waiting for power off\n"); + case P7IOC_PHB_STATE_SPDOWN_SLOT_STATUS: + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + if (!(reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT)) { + /* + * We completed the task. Clear link errors + * and restore PCIE port interrupts. + */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, + 0x00E0000000000000ul); + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, + 0xFE65000000000000ul); + + PHBDBG(p, "Slot power off: power off completely\n"); + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + return OPAL_SUCCESS; + } + + if (p->retries-- == 0) { + PHBERR(p, "Timeout powering off\n"); + goto error; + } + return p7ioc_set_sm_timeout(p, msecs_to_tb(1)); + default: + break; + } + +error: + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + return OPAL_HARDWARE; +} + +static int64_t p7ioc_slot_power_off(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + + if (p->state != P7IOC_PHB_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* run state machine */ + return p7ioc_sm_slot_power_off(p); +} + +static int64_t p7ioc_sm_slot_power_on(struct p7ioc_phb *p) +{ + uint64_t reg; + uint32_t reg32; + uint64_t ci_idx = p->index + 2; + + switch(p->state) { + case P7IOC_PHB_STATE_FUNCTIONAL: + /* Check presence */ + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + if (!(reg & PHB_PCIE_SLOTCTL2_PRSTN_STAT)) { + PHBDBG(p, "Slot power on: no device\n"); + return OPAL_CLOSED; + } + + /* Adjust UTL interrupt settings to disable various + * errors that would interfere with the process + */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0x7e00000000000000); + + /* If the power is not on, turn it on now */ + if (!(reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT)) { + /* + * The hotplug override register will not properly + * initiate the poweron sequence unless bit 0 + * transitions from 0 to 1. 
Since it can already be + * set to 1 as a result of a previous power-on + * operation (even if the slot power is now off) + * we need to first clear it, then set it to 1 or + * nothing will happen + */ + reg = in_be64(p->regs + PHB_HOTPLUG_OVERRIDE); + reg &= ~(0x8c00000000000000ul); + out_be64(p->regs + PHB_HOTPLUG_OVERRIDE, reg); + reg |= 0x8400000000000000ul; + out_be64(p->regs + PHB_HOTPLUG_OVERRIDE, reg); + p->state = P7IOC_PHB_STATE_SPUP_STABILIZE_DELAY; + PHBDBG(p, "Slot power on: powering on...\n"); + return p7ioc_set_sm_timeout(p, secs_to_tb(2)); + } + /* Power is already on */ + power_ok: + /* Mask AER receiver error */ + p7ioc_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, ®32); + reg32 |= PCIECAP_AER_CE_RECVR_ERR; + p7ioc_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, reg32); + + /* Mask CI port error and clear it */ + out_be64(p->ioc->regs + P7IOC_CIn_LEM_ERR_MASK(ci_idx), + 0xa4f4000000000000ul); + out_be64(p->regs + PHB_LEM_ERROR_MASK, + 0xadb650c9808dd051ul); + out_be64(p->ioc->regs + P7IOC_CIn_LEM_FIR(ci_idx), + 0x0ul); + + /* Disable link to avoid training issues */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + reg |= PHB_PCIE_DLP_TCTX_DISABLE; + out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, reg); + PHBDBG(p, "Slot power on: disable link training\n"); + + /* Switch to state machine of fundamental reset */ + p->state = P7IOC_PHB_STATE_FRESET_DISABLE_LINK; + p->retries = 12; + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + case P7IOC_PHB_STATE_SPUP_STABILIZE_DELAY: + /* Come here after the 2s delay after power up */ + p->retries = 1000; + p->state = P7IOC_PHB_STATE_SPUP_SLOT_STATUS; + PHBDBG(p, "Slot power on: waiting for power\n"); + /* Fall through */ + case P7IOC_PHB_STATE_SPUP_SLOT_STATUS: + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + + /* Doc says to check LED status, but we ignore that, there + * no point really and it's easier that way + */ + if (reg & PHB_PCIE_SLOTCTL2_PWR_EN_STAT) + goto power_ok; + if (p->retries-- == 0) { + /* XXX Improve error logging */ + PHBERR(p, "Timeout powering up slot\n"); + goto error; + } + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + default: + break; + } + + /* Unknown state, hardware error ? */ + error: + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + return OPAL_HARDWARE; +} + +static int64_t p7ioc_slot_power_on(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + + if (p->state != P7IOC_PHB_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* run state machine */ + return p7ioc_sm_slot_power_on(p); +} + +/* + * The OS is expected to do fundamental reset after complete + * reset to make sure the PHB could be recovered from the + * fenced state. However, the OS needn't do that explicitly + * since fundamental reset will be done automatically while + * powering on the PHB. + */ +static int64_t p7ioc_complete_reset(struct phb *phb, uint8_t assert) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + struct p7ioc *ioc = p->ioc; + uint64_t val64; + + if (assert == OPAL_ASSERT_RESET) { + if (p->state != P7IOC_PHB_STATE_FUNCTIONAL && + p->state != P7IOC_PHB_STATE_FENCED) + return OPAL_HARDWARE; + + p->flags |= P7IOC_PHB_CFG_BLOCKED; + p7ioc_phb_reset(phb); + + /* + * According to the experiment, we probably still have + * the fenced state with the corresponding PHB in the Fence + * WOF and we need clear that explicitly. Besides, the RGC + * might already have informational error and we should clear + * that explicitly as well. Otherwise, RGC XIVE#0 won't issue + * interrupt any more. 
+ */ + val64 = in_be64(ioc->regs + P7IOC_CHIP_FENCE_WOF); + val64 &= ~PPC_BIT(15 + p->index * 4); + out_be64(ioc->regs + P7IOC_CHIP_FENCE_WOF, val64); + + /* Clear informational error from RGC */ + val64 = in_be64(ioc->regs + P7IOC_RGC_LEM_BASE + P7IOC_LEM_WOF_OFFSET); + val64 &= ~PPC_BIT(18); + out_be64(ioc->regs + P7IOC_RGC_LEM_BASE + P7IOC_LEM_WOF_OFFSET, val64); + val64 = in_be64(ioc->regs + P7IOC_RGC_LEM_BASE + P7IOC_LEM_FIR_OFFSET); + val64 &= ~PPC_BIT(18); + out_be64(ioc->regs + P7IOC_RGC_LEM_BASE + P7IOC_LEM_FIR_OFFSET, val64); + + return p7ioc_sm_slot_power_off(p); + } else { + if (p->state != P7IOC_PHB_STATE_FUNCTIONAL) + return OPAL_HARDWARE; + + return p7ioc_sm_slot_power_on(p); + } + + /* We shouldn't run to here */ + return OPAL_PARAMETER; +} + +/* + * We have to mask errors prior to disabling link training. + * Otherwise it would cause infinite frozen PEs. Also, we + * should have some delay after enabling link training. It's + * the conclusion from experiment and no document mentioned + * it. + */ +static int64_t p7ioc_sm_hot_reset(struct p7ioc_phb *p) +{ + uint64_t reg; + uint32_t cfg32; + uint16_t brctl; + + switch(p->state) { + case P7IOC_PHB_STATE_FUNCTIONAL: + /* If the slot isn't present, we needn't do it */ + reg = in_be64(p->regs + PHB_PCIE_SLOTCTL2); + if (!(reg & PHB_PCIE_SLOTCTL2_PRSTN_STAT)) { + PHBDBG(p, "Slot hot reset: no device\n"); + return OPAL_CLOSED; + } + + /* Mask PCIE port interrupts and AER receiver error */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0x7E00000000000000); + p7ioc_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, &cfg32); + cfg32 |= PCIECAP_AER_CE_RECVR_ERR; + p7ioc_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, cfg32); + + /* Disable link to avoid training issues */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + reg |= PHB_PCIE_DLP_TCTX_DISABLE; + out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, reg); + PHBDBG(p, "Slot hot reset: disable link training\n"); + + p->state = P7IOC_PHB_STATE_HRESET_DISABLE_LINK; + p->retries = 12; + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + case P7IOC_PHB_STATE_HRESET_DISABLE_LINK: + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (reg & PHB_PCIE_DLP_TCRX_DISABLED) { + /* Turn on host reset */ + p7ioc_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl |= PCI_CFG_BRCTL_SECONDARY_RESET; + p7ioc_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + PHBDBG(p, "Slot hot reset: assert reset\n"); + + p->state = P7IOC_PHB_STATE_HRESET_DELAY; + return p7ioc_set_sm_timeout(p, secs_to_tb(1)); + } + + if (p->retries-- == 0) { + PHBDBG(p, "Slot hot reset: timeout to disable link training\n"); + return OPAL_HARDWARE; + } + + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + case P7IOC_PHB_STATE_HRESET_DELAY: + /* Turn off host reset */ + p7ioc_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET; + p7ioc_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + PHBDBG(p, "Slot hot reset: deassert reset\n"); + + p->state = P7IOC_PHB_STATE_HRESET_ENABLE_LINK; + return p7ioc_set_sm_timeout(p, msecs_to_tb(200)); + case P7IOC_PHB_STATE_HRESET_ENABLE_LINK: + /* Restore link control */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + reg &= ~PHB_PCIE_DLP_TCTX_DISABLE; + out_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL, reg); + PHBDBG(p, "Slot hot reset: enable link training\n"); + + p->state = P7IOC_PHB_STATE_HRESET_WAIT_LINK; + p->retries = 100; + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + case P7IOC_PHB_STATE_HRESET_WAIT_LINK: + reg = 
in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (reg & PHB_PCIE_DLP_TC_DL_LINKACT) { + /* + * Clear spurious errors and enable PCIE port + * interrupts + */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0x00E0000000000000); + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xFE65000000000000); + + /* Clear AER receiver error status */ + p7ioc_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_STATUS, + PCIECAP_AER_CE_RECVR_ERR); + /* Unmask receiver error status in AER */ + p7ioc_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, &cfg32); + cfg32 &= ~PCIECAP_AER_CE_RECVR_ERR; + p7ioc_pcicfg_write32(&p->phb, 0, + p->aercap + PCIECAP_AER_CE_MASK, cfg32); + PHBDBG(p, "Slot hot reset: link up!\n"); + + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + p->flags &= ~P7IOC_PHB_CFG_BLOCKED; + return OPAL_SUCCESS; + } + + if (p->retries-- == 0) { + PHBDBG(p, "Slot hot reset: timeout for link up\n"); + goto error; + } + + return p7ioc_set_sm_timeout(p, msecs_to_tb(10)); + default: + break; + } + + /* Unknown state, hardware error ? */ +error: + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + return OPAL_HARDWARE; +} + +static int64_t p7ioc_hot_reset(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + + if (p->state != P7IOC_PHB_STATE_FUNCTIONAL) + return OPAL_HARDWARE; + + p->flags |= P7IOC_PHB_CFG_BLOCKED; + return p7ioc_sm_hot_reset(p); +} + +static int64_t p7ioc_poll(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t now = mftb(); + + if (p->state == P7IOC_PHB_STATE_FUNCTIONAL) + return OPAL_SUCCESS; + + /* Check timer */ + if (p->delay_tgt_tb && + tb_compare(now, p->delay_tgt_tb) == TB_ABEFOREB) + return p->delay_tgt_tb - now; + + /* Expired (or not armed), clear it */ + p->delay_tgt_tb = 0; + + /* Dispatch to the right state machine */ + switch(p->state) { + case P7IOC_PHB_STATE_SPUP_STABILIZE_DELAY: + case P7IOC_PHB_STATE_SPUP_SLOT_STATUS: + return p7ioc_sm_slot_power_on(p); + case P7IOC_PHB_STATE_SPDOWN_STABILIZE_DELAY: + case P7IOC_PHB_STATE_SPDOWN_SLOT_STATUS: + return p7ioc_sm_slot_power_off(p); + case P7IOC_PHB_STATE_FRESET_DISABLE_LINK: + case P7IOC_PHB_STATE_FRESET_ASSERT_DELAY: + case P7IOC_PHB_STATE_FRESET_DEASSERT_DELAY: + case P7IOC_PHB_STATE_FRESET_WAIT_LINK: + return p7ioc_sm_freset(p); + case P7IOC_PHB_STATE_HRESET_DISABLE_LINK: + case P7IOC_PHB_STATE_HRESET_ASSERT: + case P7IOC_PHB_STATE_HRESET_DELAY: + case P7IOC_PHB_STATE_HRESET_ENABLE_LINK: + case P7IOC_PHB_STATE_HRESET_WAIT_LINK: + return p7ioc_sm_hot_reset(p); + default: + break; + } + + /* Unknown state, could be a HW error */ + return OPAL_HARDWARE; +} + +static void p7ioc_eeh_read_phb_status(struct p7ioc_phb *p, + struct OpalIoP7IOCPhbErrorData *stat) +{ + bool locked; + uint16_t tmp16; + unsigned int i; + + memset(stat, 0, sizeof(struct OpalIoP7IOCPhbErrorData)); + + + /* Error data common part */ + stat->common.version = OPAL_PHB_ERROR_DATA_VERSION_1; + stat->common.ioType = OPAL_PHB_ERROR_DATA_TYPE_P7IOC; + stat->common.len = sizeof(struct OpalIoP7IOCPhbErrorData); + + /* + * We read some registers using config space through AIB. + * + * Get to other registers using ASB when possible to get to them + * through a fence if one is present. + * + * Note that the OpalIoP7IOCPhbErrorData has oddities, such as the + * bridge control being 32-bit and the UTL registers being 32-bit + * (which they really are, but they use the top 32-bit of a 64-bit + * register so we need to be a bit careful). 
+ */ + + /* Use ASB to access PCICFG if the PHB has been fenced */ + locked = lock_recursive(&p->lock); + p->flags |= P7IOC_PHB_CFG_USE_ASB; + + /* Grab RC bridge control, make it 32-bit */ + p7ioc_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &tmp16); + stat->brdgCtl = tmp16; + + /* Grab UTL status registers */ + stat->portStatusReg = hi32(in_be64(p->regs_asb + + UTL_PCIE_PORT_STATUS)); + stat->rootCmplxStatus = hi32(in_be64(p->regs_asb + + UTL_RC_STATUS)); + stat->busAgentStatus = hi32(in_be64(p->regs_asb + + UTL_SYS_BUS_AGENT_STATUS)); + + /* + * Grab various RC PCIe capability registers. All device, slot + * and link status are 16-bit, so we grab the pair control+status + * for each of them + */ + p7ioc_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, + &stat->deviceStatus); + p7ioc_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL, + &stat->slotStatus); + p7ioc_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, + &stat->linkStatus); + + /* + * I assume those are the standard config space header, cmd & status + * together makes 32-bit. Secondary status is 16-bit so I'll clear + * the top on that one + */ + p7ioc_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &stat->devCmdStatus); + p7ioc_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &tmp16); + stat->devSecStatus = tmp16; + + /* Grab a bunch of AER regs */ + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA, + &stat->rootErrorStatus); + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS, + &stat->uncorrErrorStatus); + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, + &stat->corrErrorStatus); + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0, + &stat->tlpHdr1); + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1, + &stat->tlpHdr2); + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2, + &stat->tlpHdr3); + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3, + &stat->tlpHdr4); + p7ioc_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID, + &stat->sourceId); + + /* Restore to AIB */ + p->flags &= ~P7IOC_PHB_CFG_USE_ASB; + if (locked) { + unlock(&p->lock); + pci_put_phb(&p->phb); + } + + /* + * No idea what that that is supposed to be, opal.h says + * "Record data about the call to allocate a buffer." + * + * Let's leave them alone for now... 
+ * + * uint64_t errorClass; + * uint64_t correlator; + */ + + /* P7IOC MMIO Error Regs */ + stat->p7iocPlssr = in_be64(p->regs_asb + PHB_CPU_LOADSTORE_STATUS); + stat->p7iocCsr = in_be64(p->regs_asb + PHB_DMA_CHAN_STATUS); + stat->lemFir = in_be64(p->regs_asb + PHB_LEM_FIR_ACCUM); + stat->lemErrorMask = in_be64(p->regs_asb + PHB_LEM_ERROR_MASK); + stat->lemWOF = in_be64(p->regs_asb + PHB_LEM_WOF); + stat->phbErrorStatus = in_be64(p->regs_asb + PHB_ERR_STATUS); + stat->phbFirstErrorStatus = in_be64(p->regs_asb + PHB_ERR1_STATUS); + stat->phbErrorLog0 = in_be64(p->regs_asb + PHB_ERR_LOG_0); + stat->phbErrorLog1 = in_be64(p->regs_asb + PHB_ERR_LOG_1); + stat->mmioErrorStatus = in_be64(p->regs_asb + PHB_OUT_ERR_STATUS); + stat->mmioFirstErrorStatus = in_be64(p->regs_asb + PHB_OUT_ERR1_STATUS); + stat->mmioErrorLog0 = in_be64(p->regs_asb + PHB_OUT_ERR_LOG_0); + stat->mmioErrorLog1 = in_be64(p->regs_asb + PHB_OUT_ERR_LOG_1); + stat->dma0ErrorStatus = in_be64(p->regs_asb + PHB_INA_ERR_STATUS); + stat->dma0FirstErrorStatus = in_be64(p->regs_asb + PHB_INA_ERR1_STATUS); + stat->dma0ErrorLog0 = in_be64(p->regs_asb + PHB_INA_ERR_LOG_0); + stat->dma0ErrorLog1 = in_be64(p->regs_asb + PHB_INA_ERR_LOG_1); + stat->dma1ErrorStatus = in_be64(p->regs_asb + PHB_INB_ERR_STATUS); + stat->dma1FirstErrorStatus = in_be64(p->regs_asb + PHB_INB_ERR1_STATUS); + stat->dma1ErrorLog0 = in_be64(p->regs_asb + PHB_INB_ERR_LOG_0); + stat->dma1ErrorLog1 = in_be64(p->regs_asb + PHB_INB_ERR_LOG_1); + + /* Grab PESTA & B content */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTA, 0, true); + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) + stat->pestA[i] = in_be64(p->regs_asb + PHB_IODA_DATA0); + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTB, 0, true); + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) + stat->pestB[i] = in_be64(p->regs_asb + PHB_IODA_DATA0); +} + +static int64_t p7ioc_eeh_freeze_status(struct phb *phb, uint64_t pe_number, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity, + uint64_t *phb_status) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t peev_bit = PPC_BIT(pe_number & 0x3f); + uint64_t peev, pesta, pestb; + + /* Defaults: not frozen */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + + /* Check dead */ + if (p->state == P7IOC_PHB_STATE_BROKEN) { + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_DEAD; + goto bail; + } + + /* Check fence */ + if (p7ioc_phb_fenced(p)) { + /* Should be OPAL_EEH_STOPPED_TEMP_UNAVAIL ? 
*/ + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_FENCED; + p->state = P7IOC_PHB_STATE_FENCED; + goto bail; + } + + /* Check the PEEV */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PEEV, 0, true); + peev = in_be64(p->regs + PHB_IODA_DATA0); + if (pe_number > 63) + peev = in_be64(p->regs + PHB_IODA_DATA0); + if (!(peev & peev_bit)) + return OPAL_SUCCESS; + + /* Indicate that we have an ER pending */ + p7ioc_phb_set_err_pending(p, true); + if (severity) + *severity = OPAL_EEH_SEV_PE_ER; + + /* Read the PESTA & PESTB */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTA, pe_number, false); + pesta = in_be64(p->regs + PHB_IODA_DATA0); + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTB, pe_number, false); + pestb = in_be64(p->regs + PHB_IODA_DATA0); + + /* Convert them */ + if (pesta & IODA_PESTA_MMIO_FROZEN) + *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE; + if (pestb & IODA_PESTB_DMA_STOPPED) + *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE; + + /* XXX Handle more causes */ + if (pesta & IODA_PESTA_MMIO_CAUSE) + *pci_error_type = OPAL_EEH_PE_MMIO_ERROR; + else + *pci_error_type = OPAL_EEH_PE_DMA_ERROR; + + bail: + if (phb_status) + p7ioc_eeh_read_phb_status(p, (struct OpalIoP7IOCPhbErrorData *) + phb_status); + return OPAL_SUCCESS; +} + +static int64_t p7ioc_eeh_next_error(struct phb *phb, uint64_t *first_frozen_pe, + uint16_t *pci_error_type, uint16_t *severity) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + struct p7ioc *ioc = p->ioc; + uint64_t fir, peev0, peev1; + uint32_t cfg32, i; + + /* Check if there're pending errors on the IOC. */ + if (p7ioc_err_pending(ioc) && + p7ioc_check_LEM(ioc, pci_error_type, severity)) + return OPAL_SUCCESS; + + /* Clear result */ + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + *first_frozen_pe = (uint64_t)-1; + + /* Check dead */ + if (p->state == P7IOC_PHB_STATE_BROKEN) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + return OPAL_SUCCESS; + } + + /* Check fence */ + if (p7ioc_phb_fenced(p)) { + /* Should be OPAL_EEH_STOPPED_TEMP_UNAVAIL ? */ + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + p->state = P7IOC_PHB_STATE_FENCED; + p7ioc_phb_set_err_pending(p, false); + return OPAL_SUCCESS; + } + + /* + * If we don't have pending errors, which might be moved + * from IOC to the PHB, then check if there has any frozen PEs. + */ + if (!p7ioc_phb_err_pending(p)) { + p7ioc_phb_ioda_sel(p, IODA_TBL_PEEV, 0, true); + peev0 = in_be64(p->regs + PHB_IODA_DATA0); + peev1 = in_be64(p->regs + PHB_IODA_DATA0); + if (peev0 || peev1) { + p->err.err_src = P7IOC_ERR_SRC_PHB0 + p->index; + p->err.err_class = P7IOC_ERR_CLASS_ER; + p->err.err_bit = 0; + p7ioc_phb_set_err_pending(p, true); + } + } + + /* Check the pending errors, which might come from IOC */ + if (p7ioc_phb_err_pending(p)) { + /* + * If the frozen PE is caused by a malfunctioning TLP, we + * need reset the PHB. So convert ER to PHB-fatal error + * for the case. + */ + if (p->err.err_class == P7IOC_ERR_CLASS_ER) { + fir = in_be64(p->regs_asb + PHB_LEM_FIR_ACCUM); + if (fir & PPC_BIT(60)) { + p7ioc_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_UE_STATUS, &cfg32); + if (cfg32 & PCIECAP_AER_UE_MALFORMED_TLP) + p->err.err_class = P7IOC_ERR_CLASS_PHB; + } + } + + /* + * Map P7IOC internal error class to that one OS can handle. + * For P7IOC_ERR_CLASS_ER, we also need figure out the frozen + * PE. 
+ */ + switch (p->err.err_class) { + case P7IOC_ERR_CLASS_PHB: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + p7ioc_phb_set_err_pending(p, false); + break; + case P7IOC_ERR_CLASS_MAL: + case P7IOC_ERR_CLASS_INF: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_INF; + p7ioc_phb_set_err_pending(p, false); + break; + case P7IOC_ERR_CLASS_ER: + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + p7ioc_phb_ioda_sel(p, IODA_TBL_PEEV, 0, true); + peev0 = in_be64(p->regs + PHB_IODA_DATA0); + peev1 = in_be64(p->regs + PHB_IODA_DATA0); + + for (i = 0 ; i < 64; i++) { + if (PPC_BIT(i) & peev1) { + *first_frozen_pe = i + 64; + break; + } + } + for (i = 0 ; + *first_frozen_pe == (uint64_t)-1 && i < 64; + i++) { + if (PPC_BIT(i) & peev0) { + *first_frozen_pe = i; + break; + } + } + + /* No frozen PE? */ + if (*first_frozen_pe == (uint64_t)-1) { + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + p7ioc_phb_set_err_pending(p, false); + } + + break; + default: + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + p7ioc_phb_set_err_pending(p, false); + } + } + + return OPAL_SUCCESS; +} + +static void p7ioc_ER_err_clear(struct p7ioc_phb *p) +{ + u64 err, lem; + u32 val; + + /* Rec 1,2 */ + lem = in_be64(p->regs + PHB_LEM_FIR_ACCUM); + + /* Rec 3,4,5 AER registers (could use cfg space accessors) */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000001c00000000ull); + out_be32(p->regs + PHB_CONFIG_DATA, 0x10000000); + + /* Rec 6,7,8 XXX DOC whacks payload & req size ... we don't */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000005000000000ull); + val = in_be32(p->regs + PHB_CONFIG_DATA); + out_be32(p->regs + PHB_CONFIG_DATA, (val & 0xe0700000) | 0x0f000f00); + + /* Rec 9,10,11 */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000010400000000ull); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 12,13,14 */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000011000000000ull); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 23,24,25 */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000013000000000ull); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 26,27,28 */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000004000000000ull); + out_be32(p->regs + PHB_CONFIG_DATA, 0x470100f8); + + /* Rec 29..34 UTL registers */ + err = in_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS); + out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, err); + err = in_be64(p->regs + UTL_PCIE_PORT_STATUS); + out_be64(p->regs + UTL_PCIE_PORT_STATUS, err); + err = in_be64(p->regs + UTL_RC_STATUS); + out_be64(p->regs + UTL_RC_STATUS, err); + + /* PHB error traps registers */ + err = in_be64(p->regs + PHB_ERR_STATUS); + out_be64(p->regs + PHB_ERR_STATUS, err); + out_be64(p->regs + PHB_ERR1_STATUS, 0); + out_be64(p->regs + PHB_ERR_LOG_0, 0); + out_be64(p->regs + PHB_ERR_LOG_1, 0); + + err = in_be64(p->regs + PHB_OUT_ERR_STATUS); + out_be64(p->regs + PHB_OUT_ERR_STATUS, err); + out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0); + out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0); + out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0); + + err = in_be64(p->regs + PHB_INA_ERR_STATUS); + out_be64(p->regs + PHB_INA_ERR_STATUS, err); + out_be64(p->regs + PHB_INA_ERR1_STATUS, 0); + out_be64(p->regs + PHB_INA_ERR_LOG_0, 0); + out_be64(p->regs + PHB_INA_ERR_LOG_1, 0); + + err = in_be64(p->regs + PHB_INB_ERR_STATUS); + out_be64(p->regs + PHB_INB_ERR_STATUS, err); + out_be64(p->regs + PHB_INB_ERR1_STATUS, 0); + out_be64(p->regs + PHB_INB_ERR_LOG_0, 0); 
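/* Editor's sketch, not part of the original patch: the error-trap clearing
 * in p7ioc_ER_err_clear() above repeats one idiom per trap block -- read the
 * status register, write the value back (presumably write-one-to-clear),
 * then zero the matching "first error" and log registers. A hypothetical
 * helper like clear_err_trap() below illustrates that idiom using only the
 * in_be64()/out_be64() accessors already used in this file; the register
 * names are the real PHB_*_ERR_* offsets, the helper name is invented.
 */
static inline void clear_err_trap(struct p7ioc_phb *p, uint64_t status,
				  uint64_t err1, uint64_t log0, uint64_t log1)
{
	uint64_t err = in_be64(p->regs + status);	/* latch current error bits */

	out_be64(p->regs + status, err);		/* write them back to clear */
	out_be64(p->regs + err1, 0);			/* reset first-error capture */
	out_be64(p->regs + log0, 0);			/* and both error log words */
	out_be64(p->regs + log1, 0);
}

/* e.g. each of the PHB_ERR / OUT_ERR / INA_ERR / INB_ERR blocks could then
 * be expressed as a single call:
 *	clear_err_trap(p, PHB_INB_ERR_STATUS, PHB_INB_ERR1_STATUS,
 *		       PHB_INB_ERR_LOG_0, PHB_INB_ERR_LOG_1);
 */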
+ out_be64(p->regs + PHB_INB_ERR_LOG_1, 0); + + /* Rec 67, 68 LEM */ + out_be64(p->regs + PHB_LEM_FIR_AND_MASK, ~lem); + out_be64(p->regs + PHB_LEM_WOF, 0); +} + +static int64_t p7ioc_eeh_freeze_clear(struct phb *phb, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t peev0, peev1; + + /* XXX Now this is a heavy hammer, coming roughly from the P7IOC doc + * and my old "pseudopal" code. It will need to be refined. In general + * error handling will have to be reviewed and probably done properly + * "from scratch" based on the description in the p7IOC spec. + * + * XXX Additionally, when handling interrupts, we might want to consider + * masking while processing and/or ack'ing interrupt bits etc... + */ + u64 err; + + /* Summary. If nothing, move to clearing the PESTs which can + * contain a freeze state from a previous error or simply set + * explicitly by the user + */ + err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY); + if (err == 0) + goto clear_pest; + + p7ioc_ER_err_clear(p); + + clear_pest: + /* XXX We just clear the whole PESTA for MMIO clear and PESTB + * for DMA clear. We might want to only clear the frozen bit + * as to not clobber the rest of the state. However, we expect + * the state to have been harvested before the clear operations + * so this might not be an issue + */ + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) { + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTA, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) { + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTB, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + + /* Update ER pending indication */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PEEV, 0, true); + peev0 = in_be64(p->regs + PHB_IODA_DATA0); + peev1 = in_be64(p->regs + PHB_IODA_DATA0); + if (peev0 || peev1) { + p->err.err_src = P7IOC_ERR_SRC_PHB0 + p->index; + p->err.err_class = P7IOC_ERR_CLASS_ER; + p->err.err_bit = 0; + p7ioc_phb_set_err_pending(p, true); + } else + p7ioc_phb_set_err_pending(p, false); + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_get_diag_data(struct phb *phb, void *diag_buffer, + uint64_t diag_buffer_len) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + struct OpalIoP7IOCPhbErrorData *diag = diag_buffer; + + if (diag_buffer_len < sizeof(struct OpalIoP7IOCPhbErrorData)) + return OPAL_PARAMETER; + + /* Specific error data */ + p7ioc_eeh_read_phb_status(p, diag); + + /* + * We're running to here probably because of errors (MAL + * or INF class) from IOC. For the case, we need clear + * the pending errors and mask the error bit for MAL class + * error. Fortunately, we shouldn't get MAL class error from + * IOC on P7IOC. + */ + if (p7ioc_phb_err_pending(p) && + p->err.err_class == P7IOC_ERR_CLASS_INF && + p->err.err_src >= P7IOC_ERR_SRC_PHB0 && + p->err.err_src <= P7IOC_ERR_SRC_PHB5) { + p7ioc_ER_err_clear(p); + p7ioc_phb_set_err_pending(p, false); + } + + return OPAL_SUCCESS; +} + +/* + * We don't support address remapping now since all M64 + * BARs are sharing on remapping base address. We might + * introduce flag to the PHB in order to trace that. The + * flag allows to be changed for once. It's something to + * do in future. 
+ */ +static int64_t p7ioc_set_phb_mem_window(struct phb *phb, + uint16_t window_type, + uint16_t window_num, + uint64_t base, + uint64_t __unused pci_base, + uint64_t size) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t data64; + + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16) + return OPAL_PARAMETER; + /* The base and size should be 16MB aligned */ + if (base & 0xFFFFFF || size & 0xFFFFFF) + return OPAL_PARAMETER; + data64 = p->m64b_cache[window_num]; + data64 = SETFIELD(IODA_M64BT_BASE, data64, base >> 24); + size = (size >> 24); + data64 = SETFIELD(IODA_M64BT_MASK, data64, 0x1000000 - size); + break; + default: + return OPAL_PARAMETER; + } + + /* + * If the M64 BAR hasn't enabled yet, we needn't flush + * the setting to hardware and just keep it to the cache + */ + p->m64b_cache[window_num] = data64; + if (!(data64 & IODA_M64BT_ENABLE)) + return OPAL_SUCCESS; + p7ioc_phb_ioda_sel(p, IODA_TBL_M64BT, window_num, false); + out_be64(p->regs + PHB_IODA_DATA0, data64); + + return OPAL_SUCCESS; +} + +/* + * We can't enable or disable I/O and M32 dynamically, even + * unnecessary. So the function only support M64 BARs. + */ +static int64_t p7ioc_phb_mmio_enable(struct phb *phb, + uint16_t window_type, + uint16_t window_num, + uint16_t enable) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t data64, base, mask; + + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16 || + enable >= OPAL_ENABLE_M64_NON_SPLIT) + return OPAL_PARAMETER; + + break; + default: + return OPAL_PARAMETER; + } + + /* + * While enabling one specific M64 BAR, we should have + * the base/size configured correctly. Otherwise, it + * probably incurs fenced AIB. 
+ */ + data64 = p->m64b_cache[window_num]; + if (enable == OPAL_ENABLE_M64_SPLIT) { + base = GETFIELD(IODA_M64BT_BASE, data64); + base = (base << 24); + mask = GETFIELD(IODA_M64BT_MASK, data64); + if (base < p->m64_base || mask == 0x0ul) + return OPAL_PARTIAL; + + data64 |= IODA_M64BT_ENABLE; + } else if (enable == OPAL_DISABLE_M64) { + data64 &= ~IODA_M64BT_ENABLE; + } else + return OPAL_PARAMETER; + + p7ioc_phb_ioda_sel(p, IODA_TBL_M64BT, window_num, false); + out_be64(p->regs + PHB_IODA_DATA0, data64); + p->m64b_cache[window_num] = data64; + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_map_pe_mmio_window(struct phb *phb, uint16_t pe_number, + uint16_t window_type, + uint16_t window_num, + uint16_t segment_num) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t tbl, index; + uint64_t *cache; + + if (pe_number > 127) + return OPAL_PARAMETER; + + switch(window_type) { + case OPAL_IO_WINDOW_TYPE: + if (window_num != 0 || segment_num > 127) + return OPAL_PARAMETER; + tbl = IODA_TBL_IODT; + index = segment_num; + cache = &p->iod_cache[index]; + break; + case OPAL_M32_WINDOW_TYPE: + if (window_num != 0 || segment_num > 127) + return OPAL_PARAMETER; + tbl = IODA_TBL_M32DT; + index = segment_num; + cache = &p->m32d_cache[index]; + break; + case OPAL_M64_WINDOW_TYPE: + if (window_num > 15 || segment_num > 7) + return OPAL_PARAMETER; + + tbl = IODA_TBL_M64DT; + index = window_num << 3 | segment_num; + cache = &p->m64d_cache[index]; + break; + default: + return OPAL_PARAMETER; + } + + p7ioc_phb_ioda_sel(p, tbl, index, false); + out_be64(p->regs + PHB_IODA_DATA0, + SETFIELD(IODA_XXDT_PE, 0ull, pe_number)); + + /* Update cache */ + *cache = SETFIELD(IODA_XXDT_PE, 0ull, pe_number); + + return OPAL_SUCCESS; +} + + +static int64_t p7ioc_set_pe(struct phb *phb, uint64_t pe_number, + uint64_t bdfn, uint8_t bus_compare, + uint8_t dev_compare, uint8_t func_compare, + uint8_t pe_action) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t pelt; + uint64_t *cache = &p->peltm_cache[pe_number]; + + if (pe_number > 127 || bdfn > 0xffff) + return OPAL_PARAMETER; + if (pe_action != OPAL_MAP_PE && pe_action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (bus_compare > 7) + return OPAL_PARAMETER; + + if (pe_action == OPAL_MAP_PE) { + pelt = SETFIELD(IODA_PELTM_BUS, 0ul, bdfn >> 8); + pelt |= SETFIELD(IODA_PELTM_DEV, 0ul, (bdfn >> 3) & 0x1f); + pelt |= SETFIELD(IODA_PELTM_FUNC, 0ul, bdfn & 0x7); + pelt |= SETFIELD(IODA_PELTM_BUS_VALID, 0ul, bus_compare); + if (dev_compare) + pelt |= IODA_PELTM_DEV_VALID; + if (func_compare) + pelt |= IODA_PELTM_FUNC_VALID; + } else + pelt = 0; + + p7ioc_phb_ioda_sel(p, IODA_TBL_PELTM, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, pelt); + + /* Update cache */ + *cache = pelt; + + return OPAL_SUCCESS; +} + + +static int64_t p7ioc_set_peltv(struct phb *phb, uint32_t parent_pe, + uint32_t child_pe, uint8_t state) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint32_t reg; + uint64_t mask, peltv; + uint64_t *cache; + if (parent_pe > 127 || child_pe > 127) + return OPAL_PARAMETER; + + cache = (child_pe >> 6) ? &p->peltv_hi_cache[parent_pe] : + &p->peltv_lo_cache[parent_pe]; + reg = (child_pe >> 6) ? 
PHB_IODA_DATA1 : PHB_IODA_DATA0; + child_pe &= 0x2f; + mask = 1ull << (63 - child_pe); + + p7ioc_phb_ioda_sel(p, IODA_TBL_PELTV, parent_pe, false); + peltv = in_be64(p->regs + reg); + if (state) + peltv |= mask; + else + peltv &= ~mask; + out_be64(p->regs + reg, peltv); + + /* Update cache */ + *cache = peltv; + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_map_pe_dma_window(struct phb *phb, uint16_t pe_number, + uint16_t window_id, uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t tvt0, tvt1, t, pelt; + uint64_t dma_window_size; + uint64_t *cache_lo, *cache_hi; + + if (pe_number > 127 || window_id > 255 || tce_levels != 1) + return OPAL_PARAMETER; + cache_lo = &p->tve_lo_cache[window_id]; + cache_hi = &p->tve_hi_cache[window_id]; + + /* Encode table size */ + dma_window_size = tce_page_size * (tce_table_size >> 3); + t = ilog2(dma_window_size); + if (t < 27) + return OPAL_PARAMETER; + tvt0 = SETFIELD(IODA_TVT0_TCE_TABLE_SIZE, 0ul, (t - 26)); + + /* Encode TCE page size */ + switch(tce_page_size) { + case 0x1000: /* 4K */ + tvt1 = SETFIELD(IODA_TVT1_IO_PSIZE, 0ul, 1ul); + break; + case 0x10000: /* 64K */ + tvt1 = SETFIELD(IODA_TVT1_IO_PSIZE, 0ul, 5ul); + break; + case 0x1000000: /* 16M */ + tvt1 = SETFIELD(IODA_TVT1_IO_PSIZE, 0ul, 13ul); + break; + case 0x400000000: /* 16G */ + tvt1 = SETFIELD(IODA_TVT1_IO_PSIZE, 0ul, 23ul); + break; + default: + return OPAL_PARAMETER; + } + + /* XXX Hub number ... leave 0 for now */ + + /* Shift in the address. The table address is "off by 4 bits" + * but since the field is itself shifted by 16, we basically + * need to write the address >> 12, which basically boils down + * to writing a 4k page address + */ + tvt0 = SETFIELD(IODA_TVT0_TABLE_ADDR, tvt0, tce_table_addr >> 12); + + /* Read the PE filter info from the PELT-M */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PELTM, pe_number, false); + pelt = in_be64(p->regs + PHB_IODA_DATA0); + + /* Copy in filter bits from PELT */ + tvt0 = SETFIELD(IODA_TVT0_BUS_VALID, tvt0, + GETFIELD(IODA_PELTM_BUS_VALID, pelt)); + tvt0 = SETFIELD(IODA_TVT0_BUS_NUM, tvt0, + GETFIELD(IODA_PELTM_BUS, pelt)); + tvt1 = SETFIELD(IODA_TVT1_DEV_NUM, tvt1, + GETFIELD(IODA_PELTM_DEV, pelt)); + tvt1 = SETFIELD(IODA_TVT1_FUNC_NUM, tvt1, + GETFIELD(IODA_PELTM_FUNC, pelt)); + if (pelt & IODA_PELTM_DEV_VALID) + tvt1 |= IODA_TVT1_DEV_VALID; + if (pelt & IODA_PELTM_FUNC_VALID) + tvt1 |= IODA_TVT1_FUNC_VALID; + tvt1 = SETFIELD(IODA_TVT1_PE_NUM, tvt1, pe_number); + + /* Write the TVE */ + p7ioc_phb_ioda_sel(p, IODA_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA1, tvt1); + out_be64(p->regs + PHB_IODA_DATA0, tvt0); + + /* Update cache */ + *cache_lo = tvt0; + *cache_hi = tvt1; + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_map_pe_dma_window_real(struct phb *phb __unused, + uint16_t pe_number __unused, + uint16_t dma_window_num __unused, + uint64_t pci_start_addr __unused, + uint64_t pci_mem_size __unused) +{ + /* XXX Not yet implemented (not yet used by Linux) */ + return OPAL_UNSUPPORTED; +} + +static int64_t p7ioc_set_mve(struct phb *phb, uint32_t mve_number, + uint32_t pe_number) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t pelt, mve = 0; + uint64_t *cache = &p->mve_cache[mve_number]; + + if (pe_number > 127 || mve_number > 255) + return OPAL_PARAMETER; + + /* Read the PE filter info from the PELT-M */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PELTM, pe_number, false); + pelt = in_be64(p->regs + 
PHB_IODA_DATA0); + + mve = SETFIELD(IODA_MVT_BUS_VALID, mve, + GETFIELD(IODA_PELTM_BUS_VALID, pelt)); + mve = SETFIELD(IODA_MVT_BUS_NUM, mve, + GETFIELD(IODA_PELTM_BUS, pelt)); + mve = SETFIELD(IODA_MVT_DEV_NUM, mve, + GETFIELD(IODA_PELTM_DEV, pelt)); + mve = SETFIELD(IODA_MVT_FUNC_NUM, mve, + GETFIELD(IODA_PELTM_FUNC, pelt)); + if (pelt & IODA_PELTM_DEV_VALID) + mve |= IODA_MVT_DEV_VALID; + if (pelt & IODA_PELTM_FUNC_VALID) + mve |= IODA_MVT_FUNC_VALID; + mve = SETFIELD(IODA_MVT_PE_NUM, mve, pe_number); + + p7ioc_phb_ioda_sel(p, IODA_TBL_MVT, mve_number, false); + out_be64(p->regs + PHB_IODA_DATA0, mve); + + /* Update cache */ + *cache = mve; + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_set_mve_enable(struct phb *phb, uint32_t mve_number, + uint32_t state) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t mve; + uint64_t *cache = &p->mve_cache[mve_number]; + + if (mve_number > 255) + return OPAL_PARAMETER; + + p7ioc_phb_ioda_sel(p, IODA_TBL_MVT, mve_number, false); + mve = in_be64(p->regs + PHB_IODA_DATA0); + if (state) + mve |= IODA_MVT_VALID; + else + mve &= ~IODA_MVT_VALID; + out_be64(p->regs + PHB_IODA_DATA0, mve); + + /* Update cache */ + *cache = mve; + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_set_xive_pe(struct phb *phb, uint32_t pe_number, + uint32_t xive_num) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + uint64_t xive; + + if (pe_number > 127 || xive_num > 255) + return OPAL_PARAMETER; + + /* Update MXIVE cache */ + xive = p->mxive_cache[xive_num]; + xive = SETFIELD(IODA_XIVT_PENUM, xive, pe_number); + p->mxive_cache[xive_num] = xive; + + /* Update HW */ + p7ioc_phb_ioda_sel(p, IODA_TBL_MXIVT, xive_num, false); + xive = in_be64(p->regs + PHB_IODA_DATA0); + xive = SETFIELD(IODA_XIVT_PENUM, xive, pe_number); + out_be64(p->regs + PHB_IODA_DATA0, xive); + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_get_xive_source(struct phb *phb, uint32_t xive_num, + int32_t *interrupt_source_number) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + + if (xive_num > 255 || !interrupt_source_number) + return OPAL_PARAMETER; + + *interrupt_source_number = (p->buid_msi << 4) | xive_num; + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_get_msi_32(struct phb *phb __unused, uint32_t mve_number, + uint32_t xive_num, uint8_t msi_range, + uint32_t *msi_address, uint32_t *message_data) +{ + if (mve_number > 255 || xive_num > 255 || msi_range != 1) + return OPAL_PARAMETER; + + *msi_address = 0xffff0000 | (mve_number << 4); + *message_data = xive_num; + + return OPAL_SUCCESS; +} + +static int64_t p7ioc_get_msi_64(struct phb *phb __unused, uint32_t mve_number, + uint32_t xive_num, uint8_t msi_range, + uint64_t *msi_address, uint32_t *message_data) +{ + if (mve_number > 255 || xive_num > 255 || msi_range != 1) + return OPAL_PARAMETER; + + *msi_address = (9ul << 60) | (((u64)mve_number) << 48); + *message_data = xive_num; + + return OPAL_SUCCESS; +} + +static void p7ioc_root_port_init(struct phb *phb, struct pci_device *dev, + int ecap, int aercap) +{ + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); 
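/* Editor's sketch, not part of the original patch: p7ioc_root_port_init(),
 * p7ioc_switch_port_init() and p7ioc_endpoint_init() all rely on the same
 * read-modify-write idiom to OR enable bits into PCI config registers, as
 * in the DEVCTL update just above. A hypothetical helper such as
 * pci_cfg_setbits16() below shows that idiom, built only on the
 * pci_cfg_read16()/pci_cfg_write16() accessors these routines already use;
 * the helper name itself is invented.
 */
static void pci_cfg_setbits16(struct phb *phb, uint16_t bdfn,
			      uint32_t offset, uint16_t bits)
{
	uint16_t val;

	/* read the current value, set the requested bits, write it back */
	pci_cfg_read16(phb, bdfn, offset, &val);
	val |= bits;
	pci_cfg_write16(phb, bdfn, offset, val);
}

/* e.g. the SERR/parity enable at the top of p7ioc_root_port_init() could
 * then be written as:
 *	pci_cfg_setbits16(phb, bdfn, PCI_CFG_CMD,
 *			  PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP);
 */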
+ pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* Mask various unrecoverable errors */ + if (!aercap) return; + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32); + val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP | + PCIECAP_AER_UE_MASK_COMPL_TIMEOUT | + PCIECAP_AER_UE_MASK_COMPL_ABORT | + PCIECAP_AER_UE_MASK_ECRC); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32); + + /* Report various unrecoverable errors as fatal errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32); + val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* Mask various recoverable errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32); + val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); + + /* Enable all error reporting */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32); + val32 |= (PCIECAP_AER_RERR_CMD_FE | + PCIECAP_AER_RERR_CMD_NFE | + PCIECAP_AER_RERR_CMD_CE); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32); +} + +static void p7ioc_switch_port_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking and disable INTx */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN | + PCI_CFG_CMD_INTx_DIS); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Disable partity error and enable system error */ + pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16); + val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN; + val16 |= PCI_CFG_BRCTL_SERR_EN; + pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT); + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* Unmask all unrecoverable errors */ + if (!aercap) return; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0); + + /* Severity of unrecoverable errors */ + if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT) + val32 = (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + else + val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* Mask various correctable errors */ + val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC generation and disable ECRC check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN; + val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN; + pci_cfg_write32(phb, bdfn, 
aercap + PCIECAP_AER_CAPCTL, val32); +} + +static void p7ioc_endpoint_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* Enable ECRC generation and check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); +} + +static void p7ioc_device_init(struct phb *phb, struct pci_device *dev) +{ + int ecap = 0; + int aercap = 0; + + /* Figure out AER capability */ + if (pci_has_cap(dev, PCI_CFG_CAP_ID_EXP, false)) { + ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false); + + if (!pci_has_cap(dev, PCIECAP_ID_AER, true)) { + aercap = pci_find_ecap(phb, dev->bdfn, + PCIECAP_ID_AER, NULL); + if (aercap > 0) + pci_set_cap(dev, PCIECAP_ID_AER, aercap, true); + } else { + aercap = pci_cap(dev, PCIECAP_ID_AER, true); + } + } + + /* Reconfigure the MPS */ + pci_configure_mps(phb, dev); + + if (dev->dev_type == PCIE_TYPE_ROOT_PORT) + p7ioc_root_port_init(phb, dev, ecap, aercap); + else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT || + dev->dev_type == PCIE_TYPE_SWITCH_DNPORT) + p7ioc_switch_port_init(phb, dev, ecap, aercap); + else + p7ioc_endpoint_init(phb, dev, ecap, aercap); +} + +static int64_t p7ioc_pci_reinit(struct phb *phb, + uint64_t scope, uint64_t data) +{ + struct pci_device *pd; + uint16_t bdfn = data; + + if (scope != OPAL_REINIT_PCI_DEV) + return OPAL_PARAMETER; + + pd = pci_find_dev(phb, bdfn); + if (!pd) + return OPAL_PARAMETER; + + p7ioc_device_init(phb, pd); + return OPAL_SUCCESS; +} + +static uint8_t p7ioc_choose_bus(struct phb *phb __unused, + struct pci_device *bridge, + uint8_t candidate, uint8_t *max_bus, + bool *use_max) +{ + uint8_t m, al; + int i; + + /* Bus number selection is nasty on P7IOC. Our EEH HW can only cope + * with bus ranges that are naturally aligned powers of two. It also + * has "issues" with dealing with more than 32 bus numbers. + * + * On the other hand we can deal with overlaps to some extent as + * the PELT-M entries are ordered. + * + * We also don't need to bother with the busses between the upstream + * and downstream ports of switches. + * + * For now we apply this simple mechanism which matche what OFW does + * under OPAL: + * + * - Top level bus (PHB to RC) is 0 + * - RC to first device is 1..ff + * - Then going down, a switch gets (N = parent bus, M = parent max) + * * Upstream bridge is N+1, M, use_max = false + * * Downstream bridge is closest power of two from 32 down and + * * use max + * + * XXX NOTE: If we have access to HW VPDs, we could know whether + * this is a bridge with a single device on it such as IPR and + * limit ourselves to a single bus number. 
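+ *
+ * For illustration only (hypothetical numbers, not from the P7IOC spec):
+ * with candidate = 0x05 and *max_bus = 0x20, the alignment loop below
+ * first tries m = 31 (al = 0x20, but al + m = 63 exceeds *max_bus), then
+ * m = 15, which gives al = 0x10; the bridge is assigned buses 0x10..0x1f
+ * and use_max is set.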
+ */ + + /* Default use_max is false (legacy) */ + *use_max = false; + + /* If we are the root complex or we are not in PCIe land anymore, just + * use legacy algorithm + */ + if (!bridge || !pci_has_cap(bridge, PCI_CFG_CAP_ID_EXP, false)) + return candidate; + + /* Figure out the bridge type */ + switch(bridge->dev_type) { + case PCIE_TYPE_PCIX_TO_PCIE: + /* PCI-X to PCIE ... hrm, let's not bother too much with that */ + return candidate; + case PCIE_TYPE_SWITCH_UPPORT: + case PCIE_TYPE_ROOT_PORT: + /* Upstream port, we use legacy handling as well */ + return candidate; + case PCIE_TYPE_SWITCH_DNPORT: + case PCIE_TYPE_PCIE_TO_PCIX: + /* That leaves us with the interesting cases that we handle */ + break; + default: + /* Should not happen, treat as legacy */ + prerror("PCI: Device %04x has unsupported type %d in choose_bus\n", + bridge->bdfn, bridge->dev_type); + return candidate; + } + + /* Ok, let's find a power of two that fits, fallback to 1 */ + for (i = 5; i >= 0; i--) { + m = (1 << i) - 1; + al = (candidate + m) & ~m; + if (al <= *max_bus && (al + m) <= *max_bus) + break; + } + if (i < 0) + return 0; + *use_max = true; + *max_bus = al + m; + return al; +} + +/* p7ioc_phb_init_ioda_cache - Reset the IODA cache values + */ +static void p7ioc_phb_init_ioda_cache(struct p7ioc_phb *p) +{ + unsigned int i; + + for (i = 0; i < 8; i++) + p->lxive_cache[i] = SETFIELD(IODA_XIVT_PRIORITY, 0ull, 0xff); + for (i = 0; i < 256; i++) { + p->mxive_cache[i] = SETFIELD(IODA_XIVT_PRIORITY, 0ull, 0xff); + p->mve_cache[i] = 0; + } + for (i = 0; i < 16; i++) + p->m64b_cache[i] = 0; + + /* + * Since there is only one root port under the PHB, + * we make all PELTM entries except the last one invalid + * by configuring their RID to 00:00.1. The + * last entry is to encompass all RIDs. + */ + for (i = 0; i < 127; i++) + p->peltm_cache[i] = 0x0001f80000000000; + p->peltm_cache[127] = 0x0ul; + + for (i = 0; i < 128; i++) { + p->peltv_lo_cache[i] = 0; + p->peltv_hi_cache[i] = 0; + p->tve_lo_cache[i] = 0; + p->tve_hi_cache[i] = 0; + p->iod_cache[i] = 0; + p->m32d_cache[i] = 0; + p->m64d_cache[i] = 0; + } +} + +/* p7ioc_phb_ioda_reset - Reset the IODA tables + * + * @purge: If true, the cache is cleared and the cleared values + * are applied to HW. If false, the cached values are + * applied to HW + * + * This resets the IODA tables in the PHB. It is called at + * initialization time, on PHB reset, and can be called + * explicitly from OPAL + */ +static int64_t p7ioc_ioda_reset(struct phb *phb, bool purge) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + unsigned int i; + uint64_t reg64; + uint64_t data64, data64_hi; + uint8_t prio; + uint16_t server; + uint64_t m_server, m_prio; + + /* If the "purge" argument is set, we clear the table cache */ + if (purge) + p7ioc_phb_init_ioda_cache(p); + + /* Init_18..19: Setup the HRT + * + * XXX NOTE: I still don't completely get that HRT business so + * I'll just mimic BML and put the PHB number + 1 in there + */ + p7ioc_phb_ioda_sel(p, IODA_TBL_HRT, 0, true); + out_be64(p->regs + PHB_IODA_DATA0, p->index + 1); + out_be64(p->regs + PHB_IODA_DATA0, p->index + 1); + out_be64(p->regs + PHB_IODA_DATA0, p->index + 1); + out_be64(p->regs + PHB_IODA_DATA0, p->index + 1); + + /* Init_20..21: Cleanup the LXIVT + * + * We set the priority to FF (masked) and clear everything + * else. That means we leave the HRT index at 0 which is + * going to remain unmodified... for now.
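+ *
+ * The loop below also applies the server/priority mangling used
+ * throughout this file: the 3 low bits of the server number are folded
+ * into the top of the 8-bit priority. For illustration only
+ * (hypothetical values): server = 0x0009, prio = 0x40 gives
+ * m_server = 0x0009 >> 3 = 1 and
+ * m_prio = (0x40 >> 3) | ((0x0009 & 7) << 5) = 0x28.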
+ */ + p7ioc_phb_ioda_sel(p, IODA_TBL_LXIVT, 0, true); + for (i = 0; i < 8; i++) { + data64 = p->lxive_cache[i]; + server = GETFIELD(IODA_XIVT_SERVER, data64); + prio = GETFIELD(IODA_XIVT_PRIORITY, data64); + + /* Now we mangle the server and priority */ + if (prio == 0xff) { + m_server = 0; + m_prio = 0xff; + } else { + m_server = server >> 3; + m_prio = (prio >> 3) | ((server & 7) << 5); + } + + data64 = SETFIELD(IODA_XIVT_SERVER, data64, m_server); + data64 = SETFIELD(IODA_XIVT_PRIORITY, data64, m_prio); + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_22..23: Cleanup the MXIVT + * + * We set the priority to FF (masked) and clear everything + * else. That means we leave the HRT index to 0 which is + * going to remain unmodified... for now. + */ + p7ioc_phb_ioda_sel(p, IODA_TBL_MXIVT, 0, true); + for (i = 0; i < 256; i++) { + data64 = p->mxive_cache[i]; + server = GETFIELD(IODA_XIVT_SERVER, data64); + prio = GETFIELD(IODA_XIVT_PRIORITY, data64); + + /* Now we mangle the server and priority */ + if (prio == 0xff) { + m_server = 0; + m_prio = 0xff; + } else { + m_server = server >> 3; + m_prio = (prio >> 3) | ((server & 7) << 5); + } + + data64 = SETFIELD(IODA_XIVT_SERVER, data64, m_server); + data64 = SETFIELD(IODA_XIVT_PRIORITY, data64, m_prio); + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_24..25: Cleanup the MVT */ + p7ioc_phb_ioda_sel(p, IODA_TBL_MVT, 0, true); + for (i = 0; i < 256; i++) { + data64 = p->mve_cache[i]; + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_26..27: Cleanup the PELTM + * + * A completely clear PELTM should make everything match PE 0 + */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PELTM, 0, true); + for (i = 0; i < 127; i++) { + data64 = p->peltm_cache[i]; + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_28..30: Cleanup the PELTV */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PELTV, 0, true); + for (i = 0; i < 127; i++) { + data64 = p->peltv_lo_cache[i]; + data64_hi = p->peltv_hi_cache[i]; + out_be64(p->regs + PHB_IODA_DATA1, data64_hi); + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_31..33: Cleanup the TVT */ + p7ioc_phb_ioda_sel(p, IODA_TBL_TVT, 0, true); + for (i = 0; i < 127; i++) { + data64 = p->tve_lo_cache[i]; + data64_hi = p->tve_hi_cache[i]; + out_be64(p->regs + PHB_IODA_DATA1, data64_hi); + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_34..35: Cleanup the M64BT + * + * We don't enable M64 BARs by default. However, + * we shouldn't purge the hw and cache for it in + * future. 
+ */ + p7ioc_phb_ioda_sel(p, IODA_TBL_M64BT, 0, true); + for (i = 0; i < 16; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + /* Init_36..37: Cleanup the IODT */ + p7ioc_phb_ioda_sel(p, IODA_TBL_IODT, 0, true); + for (i = 0; i < 127; i++) { + data64 = p->iod_cache[i]; + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_38..39: Cleanup the M32DT */ + p7ioc_phb_ioda_sel(p, IODA_TBL_M32DT, 0, true); + for (i = 0; i < 127; i++) { + data64 = p->m32d_cache[i]; + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_40..41: Cleanup the M64DT */ + p7ioc_phb_ioda_sel(p, IODA_TBL_M64BT, 0, true); + for (i = 0; i < 16; i++) { + data64 = p->m64b_cache[i]; + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + p7ioc_phb_ioda_sel(p, IODA_TBL_M64DT, 0, true); + for (i = 0; i < 127; i++) { + data64 = p->m64d_cache[i]; + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Clear up the TCE cache */ + reg64 = in_be64(p->regs + PHB_PHB2_CONFIG); + reg64 &= ~PHB_PHB2C_64B_TCE_EN; + out_be64(p->regs + PHB_PHB2_CONFIG, reg64); + reg64 |= PHB_PHB2C_64B_TCE_EN; + out_be64(p->regs + PHB_PHB2_CONFIG, reg64); + in_be64(p->regs + PHB_PHB2_CONFIG); + + /* Clear PEST & PEEV */ + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { + uint64_t pesta, pestb; + + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTA, i, false); + pesta = in_be64(p->regs + PHB_IODA_DATA0); + out_be64(p->regs + PHB_IODA_DATA0, 0); + p7ioc_phb_ioda_sel(p, IODA_TBL_PESTB, i, false); + pestb = in_be64(p->regs + PHB_IODA_DATA0); + out_be64(p->regs + PHB_IODA_DATA0, 0); + + if ((pesta & IODA_PESTA_MMIO_FROZEN) || + (pestb & IODA_PESTB_DMA_STOPPED)) + PHBDBG(p, "Frozen PE#%d (%s - %s)\n", + i, (pestb & IODA_PESTB_DMA_STOPPED) ? "DMA" : "", + (pesta & IODA_PESTA_MMIO_FROZEN) ? "MMIO" : ""); + } + + p7ioc_phb_ioda_sel(p, IODA_TBL_PEEV, 0, true); + for (i = 0; i < 2; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + return OPAL_SUCCESS; +} + +static const struct phb_ops p7ioc_phb_ops = { + .lock = p7ioc_phb_lock, + .unlock = p7ioc_phb_unlock, + .cfg_read8 = p7ioc_pcicfg_read8, + .cfg_read16 = p7ioc_pcicfg_read16, + .cfg_read32 = p7ioc_pcicfg_read32, + .cfg_write8 = p7ioc_pcicfg_write8, + .cfg_write16 = p7ioc_pcicfg_write16, + .cfg_write32 = p7ioc_pcicfg_write32, + .choose_bus = p7ioc_choose_bus, + .device_init = p7ioc_device_init, + .pci_reinit = p7ioc_pci_reinit, + .eeh_freeze_status = p7ioc_eeh_freeze_status, + .eeh_freeze_clear = p7ioc_eeh_freeze_clear, + .get_diag_data = NULL, + .get_diag_data2 = p7ioc_get_diag_data, + .next_error = p7ioc_eeh_next_error, + .phb_mmio_enable = p7ioc_phb_mmio_enable, + .set_phb_mem_window = p7ioc_set_phb_mem_window, + .map_pe_mmio_window = p7ioc_map_pe_mmio_window, + .set_pe = p7ioc_set_pe, + .set_peltv = p7ioc_set_peltv, + .map_pe_dma_window = p7ioc_map_pe_dma_window, + .map_pe_dma_window_real = p7ioc_map_pe_dma_window_real, + .set_mve = p7ioc_set_mve, + .set_mve_enable = p7ioc_set_mve_enable, + .set_xive_pe = p7ioc_set_xive_pe, + .get_xive_source = p7ioc_get_xive_source, + .get_msi_32 = p7ioc_get_msi_32, + .get_msi_64 = p7ioc_get_msi_64, + .ioda_reset = p7ioc_ioda_reset, + .presence_detect = p7ioc_presence_detect, + .link_state = p7ioc_link_state, + .power_state = p7ioc_power_state, + .slot_power_off = p7ioc_slot_power_off, + .slot_power_on = p7ioc_slot_power_on, + .complete_reset = p7ioc_complete_reset, + .hot_reset = p7ioc_hot_reset, + .fundamental_reset = p7ioc_freset, + .poll = p7ioc_poll, +}; + +/* p7ioc_phb_get_xive - Interrupt control from OPAL */ +static int64_t p7ioc_msi_get_xive(void *data, uint32_t isn, + 
uint16_t *server, uint8_t *prio) +{ + struct p7ioc_phb *p = data; + uint32_t irq, fbuid = P7_IRQ_FBUID(isn); + uint64_t xive; + + if (fbuid < p->buid_msi || fbuid >= (p->buid_msi + 0x10)) + return OPAL_PARAMETER; + + irq = isn & 0xff; + xive = p->mxive_cache[irq]; + + *server = GETFIELD(IODA_XIVT_SERVER, xive); + *prio = GETFIELD(IODA_XIVT_PRIORITY, xive); + + return OPAL_SUCCESS; +} + +/* p7ioc_phb_set_xive - Interrupt control from OPAL */ +static int64_t p7ioc_msi_set_xive(void *data, uint32_t isn, + uint16_t server, uint8_t prio) +{ + struct p7ioc_phb *p = data; + uint32_t irq, fbuid = P7_IRQ_FBUID(isn); + uint64_t xive, m_server, m_prio; + + if (fbuid < p->buid_msi || fbuid >= (p->buid_msi + 0x10)) + return OPAL_PARAMETER; + + /* We cache the arguments because we have to mangle + * it in order to hijack 3 bits of priority to extend + * the server number + */ + irq = isn & 0xff; + xive = p->mxive_cache[irq]; + xive = SETFIELD(IODA_XIVT_SERVER, xive, server); + xive = SETFIELD(IODA_XIVT_PRIORITY, xive, prio); + p->mxive_cache[irq] = xive; + + /* Now we mangle the server and priority */ + if (prio == 0xff) { + m_server = 0; + m_prio = 0xff; + } else { + m_server = server >> 3; + m_prio = (prio >> 3) | ((server & 7) << 5); + } + + /* We use HRT entry 0 always for now */ + p7ioc_phb_ioda_sel(p, IODA_TBL_MXIVT, irq, false); + xive = in_be64(p->regs + PHB_IODA_DATA0); + xive = SETFIELD(IODA_XIVT_SERVER, xive, m_server); + xive = SETFIELD(IODA_XIVT_PRIORITY, xive, m_prio); + out_be64(p->regs + PHB_IODA_DATA0, xive); + + return OPAL_SUCCESS; +} + +/* p7ioc_phb_get_xive - Interrupt control from OPAL */ +static int64_t p7ioc_lsi_get_xive(void *data, uint32_t isn, + uint16_t *server, uint8_t *prio) +{ + struct p7ioc_phb *p = data; + uint32_t irq = (isn & 0x7); + uint32_t fbuid = P7_IRQ_FBUID(isn); + uint64_t xive; + + if (fbuid != p->buid_lsi) + return OPAL_PARAMETER; + + xive = p->lxive_cache[irq]; + *server = GETFIELD(IODA_XIVT_SERVER, xive); + *prio = GETFIELD(IODA_XIVT_PRIORITY, xive); + + return OPAL_SUCCESS; +} + +/* p7ioc_phb_set_xive - Interrupt control from OPAL */ +static int64_t p7ioc_lsi_set_xive(void *data, uint32_t isn, + uint16_t server, uint8_t prio) +{ + struct p7ioc_phb *p = data; + uint32_t irq = (isn & 0x7); + uint32_t fbuid = P7_IRQ_FBUID(isn); + uint64_t xive, m_server, m_prio; + + if (fbuid != p->buid_lsi) + return OPAL_PARAMETER; + + xive = SETFIELD(IODA_XIVT_SERVER, 0ull, server); + xive = SETFIELD(IODA_XIVT_PRIORITY, xive, prio); + + /* + * We cache the arguments because we have to mangle + * it in order to hijack 3 bits of priority to extend + * the server number + */ + p->lxive_cache[irq] = xive; + + /* Now we mangle the server and priority */ + if (prio == 0xff) { + m_server = 0; + m_prio = 0xff; + } else { + m_server = server >> 3; + m_prio = (prio >> 3) | ((server & 7) << 5); + } + + /* We use HRT entry 0 always for now */ + p7ioc_phb_ioda_sel(p, IODA_TBL_LXIVT, irq, false); + xive = in_be64(p->regs + PHB_IODA_DATA0); + xive = SETFIELD(IODA_XIVT_SERVER, xive, m_server); + xive = SETFIELD(IODA_XIVT_PRIORITY, xive, m_prio); + out_be64(p->regs + PHB_IODA_DATA0, xive); + + return OPAL_SUCCESS; +} + +static void p7ioc_phb_err_interrupt(void *data, uint32_t isn) +{ + struct p7ioc_phb *p = data; + uint64_t peev0, peev1; + + PHBDBG(p, "Got interrupt 0x%04x\n", isn); + + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR); + + /* If the PHB is broken, go away */ + if (p->state == P7IOC_PHB_STATE_BROKEN) + return; + + /* + * Check if there's an error pending 
and update PHB fence + * state and return, the ER error is drowned at this point + */ + lock(&p->lock); + if (p7ioc_phb_fenced(p)) { + p->state = P7IOC_PHB_STATE_FENCED; + PHBERR(p, "ER error ignored, PHB fenced\n"); + unlock(&p->lock); + return; + } + + /* + * If we already had pending errors, which might be + * moved from IOC, then we needn't check PEEV to avoid + * overwriting the errors from IOC. + */ + if (!p7ioc_phb_err_pending(p)) { + unlock(&p->lock); + return; + } + + /* + * We don't have pending errors from IOC, it's safe + * to check PEEV for frozen PEs. + */ + p7ioc_phb_ioda_sel(p, IODA_TBL_PEEV, 0, true); + peev0 = in_be64(p->regs + PHB_IODA_DATA0); + peev1 = in_be64(p->regs + PHB_IODA_DATA0); + if (peev0 || peev1) { + p->err.err_src = P7IOC_ERR_SRC_PHB0 + p->index; + p->err.err_class = P7IOC_ERR_CLASS_ER; + p->err.err_bit = 0; + p7ioc_phb_set_err_pending(p, true); + } + unlock(&p->lock); +} + +/* MSIs (OS owned) */ +static const struct irq_source_ops p7ioc_msi_irq_ops = { + .get_xive = p7ioc_msi_get_xive, + .set_xive = p7ioc_msi_set_xive, +}; + +/* LSIs (OS owned) */ +static const struct irq_source_ops p7ioc_lsi_irq_ops = { + .get_xive = p7ioc_lsi_get_xive, + .set_xive = p7ioc_lsi_set_xive, +}; + +/* PHB Errors (Ski owned) */ +static const struct irq_source_ops p7ioc_phb_err_irq_ops = { + .get_xive = p7ioc_lsi_get_xive, + .set_xive = p7ioc_lsi_set_xive, + .interrupt = p7ioc_phb_err_interrupt, +}; + +static void p7ioc_pcie_add_node(struct p7ioc_phb *p) +{ + + uint64_t reg[2], iob, m32b, m64b, tkill; + uint32_t lsibase, icsp = get_ics_phandle(); + struct dt_node *np; + + reg[0] = cleanup_addr((uint64_t)p->regs); + reg[1] = 0x100000; + + np = dt_new_addr(p->ioc->dt_node, "pciex", reg[0]); + if (!np) + return; + + p->phb.dt_node = np; + dt_add_property_strings(np, "compatible", "ibm,p7ioc-pciex", + "ibm,ioda-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", reg, sizeof(reg)); + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */ + dt_add_property_cells(np, "interrupt-parent", icsp); + /* XXX FIXME: add slot-name */ + //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? */ + + /* "ranges", we only expose IO and M32 + * + * Note: The kernel expects us to have chopped of 64k from the + * M32 size (for the 32-bit MSIs). If we don't do that, it will + * get confused (OPAL does it) + */ + iob = cleanup_addr(p->io_base); + m32b = cleanup_addr(p->m32_base + M32_PCI_START); + dt_add_property_cells(np, "ranges", + /* IO space */ + 0x01000000, 0x00000000, 0x00000000, + hi32(iob), lo32(iob), 0, PHB_IO_SIZE, + /* M32 space */ + 0x02000000, 0x00000000, M32_PCI_START, + hi32(m32b), lo32(m32b), 0,M32_PCI_SIZE - 0x10000); + + /* XXX FIXME: add opal-memwin32, dmawins, etc... 
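+ *
+ * For reference (per the standard OF PCI binding), each "ranges" entry
+ * above is 7 cells: a 3-cell PCI address whose first cell encodes the
+ * space (0x01000000 for I/O, 0x02000000 for 32-bit memory), a 2-cell
+ * CPU address and a 2-cell size, matching the #address-cells = 3 and
+ * #size-cells = 2 set on this node.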
*/ + m64b = cleanup_addr(p->m64_base); + dt_add_property_cells(np, "ibm,opal-m64-window", + hi32(m64b), lo32(m64b), + hi32(m64b), lo32(m64b), + hi32(PHB_M64_SIZE), lo32(PHB_M64_SIZE)); + dt_add_property_cells(np, "ibm,opal-msi-ports", 256); + dt_add_property_cells(np, "ibm,opal-num-pes", 128); + dt_add_property_cells(np, "ibm,opal-reserved-pe", 127); + dt_add_property_cells(np, "ibm,opal-msi-ranges", + p->buid_msi << 4, 0x100); + tkill = reg[0] + PHB_TCE_KILL; + dt_add_property_cells(np, "ibm,opal-tce-kill", + hi32(tkill), lo32(tkill)); + + /* Add associativity properties */ + add_chip_dev_associativity(np); + + /* The interrupt maps will be generated in the RC node by the + * PCI code based on the content of this structure: + */ + lsibase = p->buid_lsi << 4; + p->phb.lstate.int_size = 1; + p->phb.lstate.int_val[0][0] = lsibase + PHB_LSI_PCIE_INTA; + p->phb.lstate.int_val[1][0] = lsibase + PHB_LSI_PCIE_INTB; + p->phb.lstate.int_val[2][0] = lsibase + PHB_LSI_PCIE_INTC; + p->phb.lstate.int_val[3][0] = lsibase + PHB_LSI_PCIE_INTD; + p->phb.lstate.int_parent[0] = icsp; + p->phb.lstate.int_parent[1] = icsp; + p->phb.lstate.int_parent[2] = icsp; + p->phb.lstate.int_parent[3] = icsp; +} + +/* p7ioc_phb_setup - Setup a p7ioc_phb data structure + * + * WARNING: This is called before the AIB register routing is + * established. If this wants to access PHB registers, it must + * use the ASB hard coded variant (slower) + */ +void p7ioc_phb_setup(struct p7ioc *ioc, uint8_t index) +{ + struct p7ioc_phb *p = &ioc->phbs[index]; + unsigned int buid_base = ioc->buid_base + PHBn_BUID_BASE(index); + + p->index = index; + p->ioc = ioc; + p->gen = 2; /* Operate in Gen2 mode by default */ + p->phb.ops = &p7ioc_phb_ops; + p->phb.phb_type = phb_type_pcie_v2; + p->regs_asb = ioc->regs + PHBn_ASB_BASE(index); + p->regs = ioc->regs + PHBn_AIB_BASE(index); + p->buid_lsi = buid_base + PHB_BUID_LSI_OFFSET; + p->buid_msi = buid_base + PHB_BUID_MSI_OFFSET; + p->io_base = ioc->mmio1_win_start + PHBn_IO_BASE(index); + p->m32_base = ioc->mmio2_win_start + PHBn_M32_BASE(index); + p->m64_base = ioc->mmio2_win_start + PHBn_M64_BASE(index); + p->state = P7IOC_PHB_STATE_UNINITIALIZED; + p->phb.scan_map = 0x1; /* Only device 0 to scan */ + + /* Find P7IOC base location code in IOC */ + p->phb.base_loc_code = dt_prop_get_def(ioc->dt_node, + "ibm,io-base-loc-code", NULL); + if (!p->phb.base_loc_code) + prerror("P7IOC: Base location code not found !\n"); + + /* Create device node for PHB */ + p7ioc_pcie_add_node(p); + + /* Register OS interrupt sources */ + register_irq_source(&p7ioc_msi_irq_ops, p, p->buid_msi << 4, 256); + register_irq_source(&p7ioc_lsi_irq_ops, p, p->buid_lsi << 4, 4); + + /* Register internal interrupt source (LSI 7) */ + register_irq_source(&p7ioc_phb_err_irq_ops, p, + (p->buid_lsi << 4) + PHB_LSI_PCIE_ERROR, 1); + + /* Initialize IODA table caches */ + p7ioc_phb_init_ioda_cache(p); + + /* We register the PHB before we initialize it so we + * get a useful OPAL ID for it + */ + pci_register_phb(&p->phb); + + /* Platform additional setup */ + if (platform.pci_setup_phb) + platform.pci_setup_phb(&p->phb, p->index); +} + +static bool p7ioc_phb_wait_dlp_reset(struct p7ioc_phb *p) +{ + unsigned int i; + uint64_t val; + + /* + * Firmware cannot access the UTL core regs or PCI config space + * until the cores are out of DL_PGRESET. + * DL_PGRESET should be polled until it is inactive with a value + * of '0'. The recommended polling frequency is once every 1ms. 
+ * Firmware should poll at least 200 attempts before giving up. + * MMIO Stores to the link are silently dropped by the UTL core if + * the link is down. + * MMIO Loads to the link will be dropped by the UTL core and will + * eventually time-out and will return an all ones response if the + * link is down. + */ +#define DLP_RESET_ATTEMPTS 400 + + printf("P7IOC: Waiting for DLP PG reset to complete...\n"); + for (i = 0; i < DLP_RESET_ATTEMPTS; i++) { + val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!(val & PHB_PCIE_DLP_TC_DL_PGRESET)) + break; + time_wait_ms(1); + } + if (val & PHB_PCIE_DLP_TC_DL_PGRESET) { + PHBERR(p, "Timeout waiting for DLP PG reset !\n"); + return false; + } + return true; +} + +/* p7ioc_phb_init_rc_cfg - Initialize the Root Complex config space + */ +static bool p7ioc_phb_init_rc_cfg(struct p7ioc_phb *p) +{ + int64_t ecap, aercap; + + /* XXX Handle errors ? */ + + /* Init_51..51: + * + * Set primary bus to 0, secondary to 1 and subordinate to 0xff + */ + p7ioc_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100); + + /* Init_52..57 + * + * IO and Memory base & limits are set to base > limit, which + * allows all inbounds. + * + * XXX This has the potential of confusing the OS which might + * think that nothing is forwarded downstream. We probably need + * to fix this to match the IO and M32 PHB windows + */ + p7ioc_pcicfg_write16(&p->phb, 0, PCI_CFG_IO_BASE, 0x0010); + p7ioc_pcicfg_write32(&p->phb, 0, PCI_CFG_MEM_BASE, 0x00000010); + p7ioc_pcicfg_write32(&p->phb, 0, PCI_CFG_PREF_MEM_BASE, 0x00000010); + + /* Init_58..: Setup bridge control to enable forwarding of CORR, FATAL, + * and NONFATAL errors + */ + p7ioc_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, PCI_CFG_BRCTL_SERR_EN); + + /* Init_60..61 + * + * PCIE Device control/status, enable error reporting, disable relaxed + * ordering, set MPS to 128 (see note), clear errors. + * + * Note: The doc recommends setting MPS to 4K. This has proved to have + * some issues as it requires specific clamping of MRSS on devices and + * we've found devices in the field that misbehave when doing that. + * + * We currently leave it all to 128 bytes (minimum setting) at init + * time. The generic PCIe probing later on might apply a different + * value, or the kernel will, but we play it safe at early init + */ + if (p->ecap <= 0) { + ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP); + if (ecap < 0) { + PHBERR(p, "Can't locate PCI-E capability\n"); + return false; + } + p->ecap = ecap; + } else { + ecap = p->ecap; + } + + p7ioc_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVSTAT, + PCICAP_EXP_DEVSTAT_CE | + PCICAP_EXP_DEVSTAT_NFE | + PCICAP_EXP_DEVSTAT_FE | + PCICAP_EXP_DEVSTAT_UE); + + p7ioc_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL, + PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT | + SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B)); + + /* Init_62..63 + * + * Root Control Register. Enable error reporting + * + * Note: Added CRS visibility. + */ + p7ioc_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_RC, + PCICAP_EXP_RC_SYSERR_ON_CE | + PCICAP_EXP_RC_SYSERR_ON_NFE | + PCICAP_EXP_RC_SYSERR_ON_FE | + PCICAP_EXP_RC_CRS_VISIBLE); + + /* Init_64..65 + * + * Device Control 2.
Enable ARI fwd, set timer + */ + p7ioc_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2, + SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 2) | + PCICAP_EXP_DCTL2_ARI_FWD); + + /* Init_66..81 + * + * AER inits + */ + aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL); + if (aercap < 0) { + /* Shouldn't happen */ + PHBERR(p, "Failed to locate AER capability in bridge\n"); + return false; + } + p->aercap = aercap; + + /* Clear all UE status */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_STATUS, + 0xffffffff); + /* Disable some error reporting as per the P7IOC spec */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK, + PCIECAP_AER_UE_POISON_TLP | + PCIECAP_AER_UE_COMPL_TIMEOUT | + PCIECAP_AER_UE_COMPL_ABORT | + PCIECAP_AER_UE_ECRC); + /* Report some errors as fatal */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_SEVERITY, + PCIECAP_AER_UE_DLP | + PCIECAP_AER_UE_SURPRISE_DOWN | + PCIECAP_AER_UE_FLOW_CTL_PROT | + PCIECAP_AER_UE_UNEXP_COMPL | + PCIECAP_AER_UE_RECV_OVFLOW | + PCIECAP_AER_UE_MALFORMED_TLP); + /* Clear all CE status */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_STATUS, + 0xffffffff); + /* Disable some error reporting as per the P7IOC spec */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_MASK, + PCIECAP_AER_CE_ADV_NONFATAL); + /* Enable ECRC generation & checking */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CAPCTL, + PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + /* Enable reporting in root error control */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_CMD, + PCIECAP_AER_RERR_CMD_FE | + PCIECAP_AER_RERR_CMD_NFE | + PCIECAP_AER_RERR_CMD_CE); + /* Clear root error status */ + p7ioc_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_STA, + 0xffffffff); + + return true; +} + +static void p7ioc_phb_init_utl(struct p7ioc_phb *p) +{ + /* Init_82..84: Clear spurious errors and assign errors to the + * right "interrupt" signal + */ + out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, 0xffffffffffffffff); + out_be64(p->regs + UTL_SYS_BUS_AGENT_ERR_SEVERITY, 0x0000000000000000); + out_be64(p->regs + UTL_SYS_BUS_AGENT_IRQ_EN, 0xac80000000000000); + + /* Init_85..89: Setup buffer allocations */ + out_be64(p->regs + UTL_OUT_POST_DAT_BUF_ALLOC, 0x0400000000000000); + out_be64(p->regs + UTL_IN_POST_HDR_BUF_ALLOC, 0x1000000000000000); + out_be64(p->regs + UTL_IN_POST_DAT_BUF_ALLOC, 0x4000000000000000); + out_be64(p->regs + UTL_PCIE_TAGS_ALLOC, 0x0800000000000000); + out_be64(p->regs + UTL_GBIF_READ_TAGS_ALLOC, 0x0800000000000000); + + /* Init_90: PCI Express port control */ + out_be64(p->regs + UTL_PCIE_PORT_CONTROL, 0x8480000000000000); + + /* Init_91..93: Clean & setup port errors */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xff7fffffffffffff); + out_be64(p->regs + UTL_PCIE_PORT_ERROR_SEV, 0x00e0000000000000); + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0x7e65000000000000); + + /* Init_94 : Cleanup RC errors */ + out_be64(p->regs + UTL_RC_STATUS, 0xffffffffffffffff); +} + +static void p7ioc_phb_init_errors(struct p7ioc_phb *p) +{ + /* Init_98: LEM Error Mask : Temporarily disable error interrupts */ + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffff); + + /* Init_99..107: Configure main error traps & clear old state */ + out_be64(p->regs + PHB_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_ERR_LEM_ENABLE, 0xffffffffefffffff); + out_be64(p->regs + PHB_ERR_FREEZE_ENABLE, 0x0000000061c00000); + out_be64(p->regs 
+ PHB_ERR_AIB_FENCE_ENABLE, 0xffffffc58c000000); + out_be64(p->regs + PHB_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_ERR_STATUS_MASK, 0x0000000000000000); + out_be64(p->regs + PHB_ERR1_STATUS_MASK, 0x0000000000000000); + + /* Init_108_116: Configure MMIO error traps & clear old state */ + out_be64(p->regs + PHB_OUT_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_OUT_ERR_LEM_ENABLE, 0xffffffffffffffff); + out_be64(p->regs + PHB_OUT_ERR_FREEZE_ENABLE, 0x0000430803000000); + out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9df3bc00f0f0700f); + out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_OUT_ERR_STATUS_MASK, 0x0000000000000000); + out_be64(p->regs + PHB_OUT_ERR1_STATUS_MASK, 0x0000000000000000); + + /* Init_117_125: Configure DMA_A error traps & clear old state */ + out_be64(p->regs + PHB_INA_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR_LEM_ENABLE, 0xffffffffffffffff); + out_be64(p->regs + PHB_INA_ERR_FREEZE_ENABLE, 0xc00003ff01006000); + out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0x3fff50007e559fd8); + out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR_STATUS_MASK, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR1_STATUS_MASK, 0x0000000000000000); + + /* Init_126_134: Configure DMA_B error traps & clear old state */ + out_be64(p->regs + PHB_INB_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR_LEM_ENABLE, 0xffffffffffffffff); + out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0x18ff80ffff7f0000); + out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR_STATUS_MASK, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR1_STATUS_MASK, 0x0000000000000000); + + /* Init_135..138: Cleanup & configure LEM */ + out_be64(p->regs + PHB_LEM_FIR_ACCUM, 0x0000000000000000); + out_be64(p->regs + PHB_LEM_ACTION0, 0xffffffffffffffff); + out_be64(p->regs + PHB_LEM_ACTION1, 0x0000000000000000); + out_be64(p->regs + PHB_LEM_WOF, 0x0000000000000000); +} + +/* p7ioc_phb_init - Initialize the PHB hardware + * + * This is currently only called at boot time. It will eventually + * be called at runtime, for example in some cases of error recovery + * after a PHB reset in which case we might need locks etc... + */ +int64_t p7ioc_phb_init(struct p7ioc_phb *p) +{ + uint64_t val; + + PHBDBG(p, "Initializing PHB %d...\n", p->index); + + p->state = P7IOC_PHB_STATE_INITIALIZING; + + /* For some reason, the doc wants us to read the version + * register, so let's do it. We shoud probably check that + * the value makes sense... + */ + val = in_be64(p->regs_asb + PHB_VERSION); + + PHBDBG(p, "Version reg: %llx\n", val); + + /* + * Configure AIB operations + * + * This register maps upbound commands to AIB channels. + * DMA Write=0, DMA Read=2, MMIO Load Response=1, + * Interrupt Request=1, TCE Read=3. 
+ */ + /* Init_1: AIB TX Channel Mapping */ + out_be64(p->regs_asb + PHB_AIB_TX_CHAN_MAPPING, 0x0211300000000000); + + /* + * This group of steps initializes the AIB RX credits for + * the CI block’s port that is attached to this PHB. + * + * Channel 0 (Dkill): 32 command credits, 0 data credits + * (effectively infinite command credits) + * Channel 1 (DMA/TCE Read Responses): 32 command credits, 32 data + * credits (effectively infinite + * command and data credits) + * Channel 2 (Interrupt Reissue/Return): 32 command, 0 data credits + * (effectively infinite + * command credits) + * Channel 3 (MMIO Load/Stores, EOIs): 1 command, 1 data credit + */ + + /* Init_2: AIB RX Command Credit */ + out_be64(p->regs_asb + PHB_AIB_RX_CMD_CRED, 0x0020002000200001); + /* Init_3: AIB RX Data Credit */ + out_be64(p->regs_asb + PHB_AIB_RX_DATA_CRED, 0x0000002000000001); + /* Init_4: AXIB RX Credit Init Timer */ + out_be64(p->regs_asb + PHB_AIB_RX_CRED_INIT_TIMER, 0xFF00000000000000); + + /* + * Enable all 32 AIB and TCE tags. + * + * AIB tags are used for DMA read requests. + * TCE tags are used for every internal transaction as well as TCE + * read requests. + */ + + /* Init_5: PHB - AIB Tag Enable Register */ + out_be64(p->regs_asb + PHB_AIB_TAG_ENABLE, 0xFFFFFFFF00000000); + /* Init_6: PHB – TCE Tag Enable Register */ + out_be64(p->regs_asb + PHB_TCE_TAG_ENABLE, 0xFFFFFFFF00000000); + + /* Init_7: PCIE - System Configuration Register + * + * This is the default value out of reset. This register can be + * modified to change the following fields if needed: + * + * bits 04:09 - SYS_EC0C_MAXLINKWIDTH[5:0] + * The default link width is x8. This can be reduced + * to x1 or x4, if needed. + * + * bits 10:12 - SYS_EC04_MAX_PAYLOAD[2:0] + * + * The default max payload size is 4KB. This can be + * reduced to the allowed ranges from 128B + * to 2KB if needed. + */ + out_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG, 0x422800FC20000000); + + /* Init_8: PHB - PCI-E Reset Register + * + * This will deassert reset for the PCI-E cores, including the + * PHY and HSS macros. The TLDLP core will begin link training + * shortly after this register is written. + * This will also assert reset for the internal scan-only error + * report macros. The error report macro reset will be deasserted + * in a later step. + * Firmware will verify in a later step whether the PCI-E link + * has been established. + * + * NOTE: We perform a PERST at the end of the init sequence so + * we could probably skip that link training. + */ + out_be64(p->regs + PHB_RESET, 0xE800000000000000); + + /* Init_9: BUID + * + * Only the top 5 bit of the MSI field are implemented, the bottom + * are always 0. 
Our buid_msi value should also be a multiple of + * 16 so it should all fit well + */ + val = SETFIELD(PHB_BUID_LSI, 0ul, P7_BUID_BASE(p->buid_lsi)); + val |= SETFIELD(PHB_BUID_MSI, 0ul, P7_BUID_BASE(p->buid_msi)); + out_be64(p->regs + PHB_BUID, val); + + /* Init_10..12: IO Space */ + out_be64(p->regs + PHB_IO_BASE_ADDR, p->io_base); + out_be64(p->regs + PHB_IO_BASE_MASK, ~(PHB_IO_SIZE - 1)); + out_be64(p->regs + PHB_IO_START_ADDR, 0); + + /* Init_13..15: M32 Space */ + out_be64(p->regs + PHB_M32_BASE_ADDR, p->m32_base + M32_PCI_START); + out_be64(p->regs + PHB_M32_BASE_MASK, ~(M32_PCI_SIZE - 1)); + out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START); + + /* Init_16: PCIE-E Outbound Request Upper Address */ + out_be64(p->regs + PHB_M64_UPPER_BITS, 0); + + /* Init_17: PCIE-E PHB2 Configuration + * + * We enable IO, M32, 32-bit MSI and 64-bit MSI + */ + out_be64(p->regs + PHB_PHB2_CONFIG, + PHB_PHB2C_32BIT_MSI_EN | + PHB_PHB2C_IO_EN | + PHB_PHB2C_64BIT_MSI_EN | + PHB_PHB2C_M32_EN | + PHB_PHB2C_64B_TCE_EN); + + /* Init_18..xx: Reset all IODA tables */ + p7ioc_ioda_reset(&p->phb, false); + + /* Init_42..47: Clear UTL & DLP error log regs */ + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG1, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG2, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG3, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG4, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_DLP_ERRLOG1, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_DLP_ERRLOG2, 0xffffffffffffffff); + + /* Init_48: Wait for DLP core to be out of reset */ + if (!p7ioc_phb_wait_dlp_reset(p)) + goto failed; + + /* Init_49 - Clear port status */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffffffffffffffff); + + /* Init_50..81: Init root complex config space */ + if (!p7ioc_phb_init_rc_cfg(p)) + goto failed; + + /* Init_82..94 : Init UTL */ + p7ioc_phb_init_utl(p); + + /* Init_95: PCI-E Reset, deassert reset for internal error macros */ + out_be64(p->regs + PHB_RESET, 0xe000000000000000); + + /* Init_96: PHB Control register. Various PHB settings: + * + * - Enable ECC for various internal RAMs + * - Enable all TCAM entries + * - Set failed DMA read requests to return Completer Abort on error + */ + out_be64(p->regs + PHB_CONTROL, 0x7f38000000000000); + + /* Init_97: Legacy Control register + * + * The spec sets bit 0 to enable DKill to flush the TCEs. We do not + * use that mechanism however, we require the OS to directly access + * the TCE Kill register, so we leave that bit set to 0 + */ + out_be64(p->regs + PHB_LEGACY_CTRL, 0x0000000000000000); + + /* Init_98..138 : Setup error registers */ + p7ioc_phb_init_errors(p); + + /* Init_139: Read error summary */ + val = in_be64(p->regs + PHB_ETU_ERR_SUMMARY); + if (val) { + PHBERR(p, "Errors detected during PHB init: 0x%16llx\n", val); + goto failed; + } + + /* Steps Init_140..142 have been removed from the spec. */ + + /* Init_143..144: Enable IO, MMIO, Bus master etc... and clear + * status bits + */ + p7ioc_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT, + PCI_CFG_STAT_SENT_TABORT | + PCI_CFG_STAT_RECV_TABORT | + PCI_CFG_STAT_RECV_MABORT | + PCI_CFG_STAT_SENT_SERR | + PCI_CFG_STAT_RECV_PERR); + p7ioc_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD, + PCI_CFG_CMD_SERR_EN | + PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_BUS_MASTER_EN | + PCI_CFG_CMD_MEM_EN | + PCI_CFG_CMD_IO_EN); + + /* At this point, the spec suggests doing a bus walk. However we + * haven't powered up the slots with the SHCP controller. 
We'll + * deal with that and link training issues later, for now, let's + * enable the full range of error detection + */ + + /* Init_145..149: Enable error interrupts and LEM */ + out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0x0000000061c00000); + out_be64(p->regs + PHB_OUT_ERR_IRQ_ENABLE, 0x0000430803000000); + out_be64(p->regs + PHB_INA_ERR_IRQ_ENABLE, 0xc00003ff01006000); + out_be64(p->regs + PHB_INB_ERR_IRQ_ENABLE, 0x0000000000000000); + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x1249a1147f500f2c); + + /* Init_150: Enable DMA read/write TLP address speculation */ + out_be64(p->regs + PHB_TCE_PREFETCH, 0x0000c00000000000); + + /* Init_151..152: Set various timeouts */ + out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x1611112010200000); + out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x0000561300000000); + + /* Mark the PHB as functional which enables all the various sequences */ + p->state = P7IOC_PHB_STATE_FUNCTIONAL; + + return OPAL_SUCCESS; + + failed: + PHBERR(p, "Initialization failed\n"); + p->state = P7IOC_PHB_STATE_BROKEN; + + return OPAL_HARDWARE; +} + +void p7ioc_phb_reset(struct phb *phb) +{ + struct p7ioc_phb *p = phb_to_p7ioc_phb(phb); + struct p7ioc *ioc = p->ioc; + uint64_t ci_idx, rreg; + unsigned int i; + bool fenced; + + /* Check our fence status. The fence bits we care about are + * two bits per PHB at IBM bit location 14 and 15 + 4*phb + */ + fenced = p7ioc_phb_fenced(p); + + PHBDBG(p, "PHB reset... (fenced: %d)\n", (int)fenced); + + /* + * If not fenced and already functional, let's do an IODA reset + * to clear pending DMAs and wait a bit for thing to settle. It's + * notable that the IODA table cache won't be emptied so that we + * can restore them during error recovery. + */ + if (p->state == P7IOC_PHB_STATE_FUNCTIONAL && !fenced) { + PHBDBG(p, " ioda reset ...\n"); + p7ioc_ioda_reset(&p->phb, false); + time_wait_ms(100); + } + + /* CI port index */ + ci_idx = p->index + 2; + + /* Reset register bits for this PHB */ + rreg = 0;/*PPC_BIT(8 + ci_idx * 2);*/ /* CI port config reset */ + rreg |= PPC_BIT(9 + ci_idx * 2); /* CI port func reset */ + rreg |= PPC_BIT(32 + p->index); /* PHBn config reset */ + + /* Mask various errors during reset and clear pending errors */ + out_be64(ioc->regs + P7IOC_CIn_LEM_ERR_MASK(ci_idx), + 0xa4f4000000000000ul); + out_be64(p->regs_asb + PHB_LEM_ERROR_MASK, 0xadb650c9808dd051ul); + out_be64(ioc->regs + P7IOC_CIn_LEM_FIR(ci_idx), 0); + + /* We need to retry in case the fence doesn't lift due to a + * problem with lost credits (HW guys). How many times ? + */ +#define MAX_PHB_RESET_RETRIES 5 + for (i = 0; i < MAX_PHB_RESET_RETRIES; i++) { + PHBDBG(p, " reset try %d...\n", i); + /* Apply reset */ + out_be64(ioc->regs + P7IOC_CCRR, rreg); + time_wait_ms(1); + out_be64(ioc->regs + P7IOC_CCRR, 0); + + /* Check if fence lifed */ + fenced = p7ioc_phb_fenced(p); + PHBDBG(p, " fenced: %d...\n", (int)fenced); + if (!fenced) + break; + } + + /* Reset failed, not much to do, maybe add an error return */ + if (fenced) { + PHBERR(p, "Reset failed, fence still set !\n"); + p->state = P7IOC_PHB_STATE_BROKEN; + return; + } + + /* Wait a bit */ + time_wait_ms(100); + + /* Re-initialize the PHB */ + p7ioc_phb_init(p); + + /* Restore the CI error mask */ + out_be64(ioc->regs + P7IOC_CIn_LEM_ERR_MASK_AND(ci_idx), 0); +} + diff --git a/hw/p7ioc.c b/hw/p7ioc.c new file mode 100644 index 00000000..9aa6480e --- /dev/null +++ b/hw/p7ioc.c @@ -0,0 +1,677 @@ +/* Copyright 2013-2014 IBM Corp. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Determine the base address of LEM registers according to + * the indicated error source. + */ +static void *p7ioc_LEM_base(struct p7ioc *ioc, uint32_t err_src) +{ + uint32_t index; + void *base = NULL; + + switch (err_src) { + case P7IOC_ERR_SRC_RGC: + base = ioc->regs + P7IOC_RGC_LEM_BASE; + break; + case P7IOC_ERR_SRC_BI_UP: + base = ioc->regs + P7IOC_BI_UP_LEM_BASE; + break; + case P7IOC_ERR_SRC_BI_DOWN: + base = ioc->regs + P7IOC_BI_DOWN_LEM_BASE; + break; + case P7IOC_ERR_SRC_CI_P0: + case P7IOC_ERR_SRC_CI_P1: + case P7IOC_ERR_SRC_CI_P2: + case P7IOC_ERR_SRC_CI_P3: + case P7IOC_ERR_SRC_CI_P4: + case P7IOC_ERR_SRC_CI_P5: + case P7IOC_ERR_SRC_CI_P6: + case P7IOC_ERR_SRC_CI_P7: + index = err_src - P7IOC_ERR_SRC_CI_P0; + base = ioc->regs + P7IOC_CI_PORTn_LEM_BASE(index); + break; + case P7IOC_ERR_SRC_PHB0: + case P7IOC_ERR_SRC_PHB1: + case P7IOC_ERR_SRC_PHB2: + case P7IOC_ERR_SRC_PHB3: + case P7IOC_ERR_SRC_PHB4: + case P7IOC_ERR_SRC_PHB5: + index = err_src - P7IOC_ERR_SRC_PHB0; + base = ioc->regs + P7IOC_PHBn_LEM_BASE(index); + break; + case P7IOC_ERR_SRC_MISC: + base = ioc->regs + P7IOC_MISC_LEM_BASE; + break; + case P7IOC_ERR_SRC_I2C: + base = ioc->regs + P7IOC_I2C_LEM_BASE; + break; + default: + prerror("%s: Unknown error source %d\n", + __func__, err_src); + } + + return base; +} + +static void p7ioc_get_diag_common(struct p7ioc *ioc, + void *base, + struct OpalIoP7IOCErrorData *data) +{ + /* GEM */ + data->gemXfir = in_be64(ioc->regs + P7IOC_GEM_XFIR); + data->gemRfir = in_be64(ioc->regs + P7IOC_GEM_RFIR); + data->gemRirqfir = in_be64(ioc->regs + P7IOC_GEM_RIRQFIR); + data->gemMask = in_be64(ioc->regs + P7IOC_GEM_MASK); + data->gemRwof = in_be64(ioc->regs + P7IOC_GEM_RWOF); + + /* LEM */ + data->lemFir = in_be64(base + P7IOC_LEM_FIR_OFFSET); + data->lemErrMask = in_be64(base + P7IOC_LEM_ERR_MASK_OFFSET); + data->lemAction0 = in_be64(base + P7IOC_LEM_ACTION_0_OFFSET); + data->lemAction1 = in_be64(base + P7IOC_LEM_ACTION_1_OFFSET); + data->lemWof = in_be64(base + P7IOC_LEM_WOF_OFFSET); +} + +static int64_t p7ioc_get_diag_data(struct io_hub *hub, + void *diag_buffer, + uint64_t diag_buffer_len) +{ + struct p7ioc *ioc = iohub_to_p7ioc(hub); + struct OpalIoP7IOCErrorData *data = diag_buffer; + void *base; + + /* Make sure we have enough buffer */ + if (diag_buffer_len < sizeof(struct OpalIoP7IOCErrorData)) + return OPAL_PARAMETER; + + /* We need do nothing if there're no pending errors */ + if (!p7ioc_err_pending(ioc)) + return OPAL_CLOSED; + + /* + * We needn't collect diag-data for CI Port{2, ..., 7} + * and PHB{0, ..., 5} since their errors (except GXE) + * have been cached to the specific PHB. 
+ */ + base = p7ioc_LEM_base(ioc, ioc->err.err_src); + if (!base) { + p7ioc_set_err_pending(ioc, false); + return OPAL_INTERNAL_ERROR; + } + + switch (ioc->err.err_src) { + case P7IOC_ERR_SRC_RGC: + data->type = OPAL_P7IOC_DIAG_TYPE_RGC; + p7ioc_get_diag_common(ioc, base, data); + + data->rgc.rgcStatus = in_be64(ioc->regs + 0x3E1C10); + data->rgc.rgcLdcp = in_be64(ioc->regs + 0x3E1C18); + + break; + case P7IOC_ERR_SRC_BI_UP: + data->type = OPAL_P7IOC_DIAG_TYPE_BI; + data->bi.biDownbound = 0; + p7ioc_get_diag_common(ioc, base, data); + + data->bi.biLdcp0 = in_be64(ioc->regs + 0x3C0100); + data->bi.biLdcp1 = in_be64(ioc->regs + 0x3C0108); + data->bi.biLdcp2 = in_be64(ioc->regs + 0x3C0110); + data->bi.biFenceStatus = in_be64(ioc->regs + 0x3C0130); + + break; + case P7IOC_ERR_SRC_BI_DOWN: + data->type = OPAL_P7IOC_DIAG_TYPE_BI; + data->bi.biDownbound = 1; + p7ioc_get_diag_common(ioc, base, data); + + data->bi.biLdcp0 = in_be64(ioc->regs + 0x3C0118); + data->bi.biLdcp1 = in_be64(ioc->regs + 0x3C0120); + data->bi.biLdcp2 = in_be64(ioc->regs + 0x3C0128); + data->bi.biFenceStatus = in_be64(ioc->regs + 0x3C0130); + + break; + case P7IOC_ERR_SRC_CI_P0: + case P7IOC_ERR_SRC_CI_P1: + data->type = OPAL_P7IOC_DIAG_TYPE_CI; + data->ci.ciPort = ioc->err.err_src - P7IOC_ERR_SRC_CI_P0; + p7ioc_get_diag_common(ioc, base, data); + + data->ci.ciPortStatus = in_be64(base + 0x008); + data->ci.ciPortLdcp = in_be64(base + 0x010); + break; + case P7IOC_ERR_SRC_MISC: + data->type = OPAL_P7IOC_DIAG_TYPE_MISC; + p7ioc_get_diag_common(ioc, base, data); + break; + case P7IOC_ERR_SRC_I2C: + data->type = OPAL_P7IOC_DIAG_TYPE_I2C; + p7ioc_get_diag_common(ioc, base, data); + break; + default: + p7ioc_set_err_pending(ioc, false); + return OPAL_CLOSED; + } + + /* For errors of MAL class, we need mask it */ + if (ioc->err.err_class == P7IOC_ERR_CLASS_MAL) + out_be64(base + P7IOC_LEM_ERR_MASK_OR_OFFSET, + PPC_BIT(63 - ioc->err.err_bit)); + p7ioc_set_err_pending(ioc, false); + + return OPAL_SUCCESS; +} + +static const struct io_hub_ops p7ioc_hub_ops = { + .set_tce_mem = NULL, /* No set_tce_mem for p7ioc, we use FMTC */ + .get_diag_data = p7ioc_get_diag_data, + .reset = p7ioc_reset, +}; + +static int64_t p7ioc_rgc_get_xive(void *data, uint32_t isn, + uint16_t *server, uint8_t *prio) +{ + struct p7ioc *ioc = data; + uint32_t irq = (isn & 0xf); + uint32_t fbuid = P7_IRQ_FBUID(isn); + uint64_t xive; + + if (fbuid != ioc->rgc_buid) + return OPAL_PARAMETER; + + xive = ioc->xive_cache[irq]; + *server = GETFIELD(IODA_XIVT_SERVER, xive); + *prio = GETFIELD(IODA_XIVT_PRIORITY, xive); + + return OPAL_SUCCESS; + } + +static int64_t p7ioc_rgc_set_xive(void *data, uint32_t isn, + uint16_t server, uint8_t prio) +{ + struct p7ioc *ioc = data; + uint32_t irq = (isn & 0xf); + uint32_t fbuid = P7_IRQ_FBUID(isn); + uint64_t xive; + uint64_t m_server, m_prio; + + if (fbuid != ioc->rgc_buid) + return OPAL_PARAMETER; + + xive = SETFIELD(IODA_XIVT_SERVER, 0ull, server); + xive = SETFIELD(IODA_XIVT_PRIORITY, xive, prio); + ioc->xive_cache[irq] = xive; + + /* Now we mangle the server and priority */ + if (prio == 0xff) { + m_server = 0; + m_prio = 0xff; + } else { + m_server = server >> 3; + m_prio = (prio >> 3) | ((server & 7) << 5); + } + + /* Update the XIVE. 
Don't care HRT entry on P7IOC */ + out_be64(ioc->regs + 0x3e1820, (0x0002000000000000 | irq)); + xive = in_be64(ioc->regs + 0x3e1830); + xive = SETFIELD(IODA_XIVT_SERVER, xive, m_server); + xive = SETFIELD(IODA_XIVT_PRIORITY, xive, m_prio); + out_be64(ioc->regs + 0x3e1830, xive); + + return OPAL_SUCCESS; +} + +/* + * The function is used to figure out the error class and error + * bit according to LEM WOF. + * + * The bits of WOF register have been classified according to + * the error severity. Of course, we should process those errors + * with higher priority. For example, there have 2 errors (GXE, INF) + * pending, we should process GXE, and INF is meaningless in face + * of GXE. + */ +static bool p7ioc_err_bit(struct p7ioc *ioc, uint64_t wof) +{ + uint64_t val, severity[P7IOC_ERR_CLASS_LAST]; + int32_t class, bit, err_bit = -1; + + /* Clear severity array */ + memset(severity, 0, sizeof(uint64_t) * P7IOC_ERR_CLASS_LAST); + + /* + * The severity array has fixed values. However, it depends + * on the damage settings for individual components. We're + * using fixed values based on the assumption that damage settings + * are fixed for now. If we change it some day, we also need + * change the severity array accordingly. Anyway, it's something + * to improve in future so that we can figure out the severity + * array from hardware registers. + */ + switch (ioc->err.err_src) { + case P7IOC_ERR_SRC_EI: + /* EI won't create interrupt yet */ + break; + case P7IOC_ERR_SRC_RGC: + severity[P7IOC_ERR_CLASS_GXE] = 0xF00086E0F4FCFFFF; + severity[P7IOC_ERR_CLASS_RGA] = 0x0000010000000000; + severity[P7IOC_ERR_CLASS_INF] = 0x0FFF781F0B030000; + break; + case P7IOC_ERR_SRC_BI_UP: + severity[P7IOC_ERR_CLASS_GXE] = 0xF7FFFFFF7FFFFFFF; + severity[P7IOC_ERR_CLASS_INF] = 0x0800000080000000; + break; + case P7IOC_ERR_SRC_BI_DOWN: + severity[P7IOC_ERR_CLASS_GXE] = 0xDFFFF7F35F8000BF; + severity[P7IOC_ERR_CLASS_INF] = 0x2000080CA07FFF40; + break; + case P7IOC_ERR_SRC_CI_P0: + severity[P7IOC_ERR_CLASS_GXE] = 0xF5FF000000000000; + severity[P7IOC_ERR_CLASS_INF] = 0x0200FFFFFFFFFFFF; + severity[P7IOC_ERR_CLASS_MAL] = 0x0800000000000000; + break; + case P7IOC_ERR_SRC_CI_P1: + severity[P7IOC_ERR_CLASS_GXE] = 0xFFFF000000000000; + severity[P7IOC_ERR_CLASS_INF] = 0x0000FFFFFFFFFFFF; + break; + case P7IOC_ERR_SRC_CI_P2: + case P7IOC_ERR_SRC_CI_P3: + case P7IOC_ERR_SRC_CI_P4: + case P7IOC_ERR_SRC_CI_P5: + case P7IOC_ERR_SRC_CI_P6: + case P7IOC_ERR_SRC_CI_P7: + severity[P7IOC_ERR_CLASS_GXE] = 0x5B0B000000000000; + severity[P7IOC_ERR_CLASS_PHB] = 0xA4F4000000000000; + severity[P7IOC_ERR_CLASS_INF] = 0x0000FFFFFFFFFFFF; + break; + case P7IOC_ERR_SRC_MISC: + severity[P7IOC_ERR_CLASS_GXE] = 0x0000000310000000; + severity[P7IOC_ERR_CLASS_PLL] = 0x0000000001C00000; + severity[P7IOC_ERR_CLASS_INF] = 0x555FFFF0EE3FFFFF; + severity[P7IOC_ERR_CLASS_MAL] = 0xAAA0000C00000000; + break; + case P7IOC_ERR_SRC_I2C: + severity[P7IOC_ERR_CLASS_GXE] = 0x1100000000000000; + severity[P7IOC_ERR_CLASS_INF] = 0xEEFFFFFFFFFFFFFF; + break; + case P7IOC_ERR_SRC_PHB0: + case P7IOC_ERR_SRC_PHB1: + case P7IOC_ERR_SRC_PHB2: + case P7IOC_ERR_SRC_PHB3: + case P7IOC_ERR_SRC_PHB4: + case P7IOC_ERR_SRC_PHB5: + severity[P7IOC_ERR_CLASS_PHB] = 0xADB650CB808DD051; + severity[P7IOC_ERR_CLASS_ER] = 0x0000A0147F50092C; + severity[P7IOC_ERR_CLASS_INF] = 0x52490F2000222682; + break; + } + + /* + * The error class (ERR_CLASS) has been defined based on + * their severity. 
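+ *
+ * For illustration only (hypothetical values): if the unmasked WOF has
+ * IBM bit 2 set and that bit is present in severity[P7IOC_ERR_CLASS_GXE],
+ * the scan below stops at that class and records err_bit = 63 - 2 = 61;
+ * the original bit is later recovered as PPC_BIT(63 - err_bit) when it
+ * has to be masked.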
The priority of those errors within the same + * class is defined based on the position of the corresponding + * bit in the LEM (Local Error Macro) register. + */ + for (class = P7IOC_ERR_CLASS_NONE + 1; + err_bit < 0 && class < P7IOC_ERR_CLASS_LAST; + class++) { + val = wof & severity[class]; + if (!val) continue; + + for (bit = 0; bit < 64; bit++) { + if (val & PPC_BIT(bit)) { + err_bit = 63 - bit; + break; + } + } + } + + /* If we don't find the error bit, we needn't go on. */ + if (err_bit < 0) + return false; + + ioc->err.err_class = class - 1; + ioc->err.err_bit = err_bit; + return true; +} + +/* + * Check the LEM to determine the detailed error information. + * The function is expected to be called when the OS calls + * the OPAL API opal_pci_next_error(). Eventually, the errors + * from CI Port{2, ..., 7} or PHB{0, ..., 5} are cached + * to the specific PHB, while the remaining errors are cached to + * the IOC. + */ +bool p7ioc_check_LEM(struct p7ioc *ioc, + uint16_t *pci_error_type, + uint16_t *severity) +{ + void *base; + uint64_t fir, wof, mask; + struct p7ioc_phb *p; + int32_t index; + bool ret; + + /* Make sure we have an error pending on the IOC */ + if (!p7ioc_err_pending(ioc)) + return false; + + /* + * The IOC has probably been put into the fatal error + * state (GXE) because of a failure reading the + * GEM FIR. + */ + if (ioc->err.err_src == P7IOC_ERR_SRC_NONE && + ioc->err.err_class != P7IOC_ERR_CLASS_NONE) + goto err; + + /* + * Get the base address of the LEM registers according + * to the error source. If we fail to get that, + * the error pending flag is cleared. + */ + base = p7ioc_LEM_base(ioc, ioc->err.err_src); + if (!base) { + p7ioc_set_err_pending(ioc, false); + return false; + } + + /* The IOC is broken if the FIR reads back all ones */ + fir = in_be64(base + P7IOC_LEM_FIR_OFFSET); + if (fir == 0xffffffffffffffff) { + ioc->err.err_src = P7IOC_ERR_SRC_NONE; + ioc->err.err_class = P7IOC_ERR_CLASS_GXE; + goto err; + } + + /* Read ERR_MASK and WOF. However, we needn't do that for PHBn */ + wof = in_be64(base + P7IOC_LEM_WOF_OFFSET); + if (ioc->err.err_src >= P7IOC_ERR_SRC_PHB0 && + ioc->err.err_src <= P7IOC_ERR_SRC_PHB5) { + mask = 0x0ull; + } else { + mask = in_be64(base + P7IOC_LEM_ERR_MASK_OFFSET); + in_be64(base + P7IOC_LEM_ACTION_0_OFFSET); + in_be64(base + P7IOC_LEM_ACTION_1_OFFSET); + } + + /* + * We need to process the unmasked errors first. If we + * fail to get the error bit, we needn't proceed. + */ + if (wof & ~mask) + wof &= ~mask; + if (!wof) { + p7ioc_set_err_pending(ioc, false); + return false; + } + + if (!p7ioc_err_bit(ioc, wof)) { + p7ioc_set_err_pending(ioc, false); + return false; + } + +err: + /* + * We get here because of a valid error. Errors + * from CI Port{2, ..., 7} and PHB{0, ..., 5} will be cached + * to the specific PHB. However, we will cache the global + * errors (e.g. GXE) to the IOC directly. The remaining errors + * will also be cached to the IOC. + */ + if (((ioc->err.err_src >= P7IOC_ERR_SRC_CI_P2 && + ioc->err.err_src <= P7IOC_ERR_SRC_CI_P7) || + (ioc->err.err_src >= P7IOC_ERR_SRC_PHB0 && + ioc->err.err_src <= P7IOC_ERR_SRC_PHB5)) && + ioc->err.err_class != P7IOC_ERR_CLASS_GXE) { + index = (ioc->err.err_src >= P7IOC_ERR_SRC_PHB0 && + ioc->err.err_src <= P7IOC_ERR_SRC_PHB5) ?
+ (ioc->err.err_src - P7IOC_ERR_SRC_PHB0) : + (ioc->err.err_src - P7IOC_ERR_SRC_CI_P2); + p = &ioc->phbs[index]; + + if (p7ioc_phb_enabled(ioc, index)) { + p->err.err_src = ioc->err.err_src; + p->err.err_class = ioc->err.err_class; + p->err.err_bit = ioc->err.err_bit; + p7ioc_phb_set_err_pending(p, true); + p7ioc_set_err_pending(ioc, false); + + return false; + } + } + + /* + * Map the internal error class to one the OS can recognize. + * Errors from a PHB or the associated CI port would be + * GXE, PHB-fatal, ER, or INF. In that case, GXE will be + * cached to the IOC and the remaining classes will be cached to + * the specific PHB. + */ + switch (ioc->err.err_class) { + case P7IOC_ERR_CLASS_GXE: + case P7IOC_ERR_CLASS_PLL: + case P7IOC_ERR_CLASS_RGA: + *pci_error_type = OPAL_EEH_IOC_ERROR; + *severity = OPAL_EEH_SEV_IOC_DEAD; + ret = true; + break; + case P7IOC_ERR_CLASS_INF: + case P7IOC_ERR_CLASS_MAL: + *pci_error_type = OPAL_EEH_IOC_ERROR; + *severity = OPAL_EEH_SEV_INF; + ret = false; + break; + default: + p7ioc_set_err_pending(ioc, false); + ret = false; + } + + return ret; +} + +/* + * Check GEM to see if there are any problematic components. + * The function is expected to be called in the RGC interrupt + * handler. Note that a failure reading the + * XFIR will cause GXE directly. + */ +static bool p7ioc_check_GEM(struct p7ioc *ioc) +{ + uint64_t xfir, rwof; + + /* + * Recov_5: Read GEM Xfir + * Recov_6: go to GXE recovery? + */ + xfir = in_be64(ioc->regs + P7IOC_GEM_XFIR); + if (xfir == 0xffffffffffffffff) { + ioc->err.err_src = P7IOC_ERR_SRC_NONE; + ioc->err.err_class = P7IOC_ERR_CLASS_GXE; + p7ioc_set_err_pending(ioc, true); + return true; + } + + /* + * Recov_7: Read GEM Rfir + * Recov_8: Read GEM RIRQfir + * Recov_9: Read GEM RWOF + * Recov_10: Read Fence Shadow + * Recov_11: Read Fence Shadow WOF + */ + in_be64(ioc->regs + P7IOC_GEM_RFIR); + in_be64(ioc->regs + P7IOC_GEM_RIRQFIR); + rwof = in_be64(ioc->regs + P7IOC_GEM_RWOF); + in_be64(ioc->regs + P7IOC_CHIP_FENCE_SHADOW); + in_be64(ioc->regs + P7IOC_CHIP_FENCE_WOF); + + /* + * Check GEM RWOF to see which component has been + * put into a problematic state.
+ */ + ioc->err.err_src = P7IOC_ERR_SRC_NONE; + if (rwof & PPC_BIT(1)) ioc->err.err_src = P7IOC_ERR_SRC_RGC; + else if (rwof & PPC_BIT(2)) ioc->err.err_src = P7IOC_ERR_SRC_BI_UP; + else if (rwof & PPC_BIT(3)) ioc->err.err_src = P7IOC_ERR_SRC_BI_DOWN; + else if (rwof & PPC_BIT(4)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P0; + else if (rwof & PPC_BIT(5)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P1; + else if (rwof & PPC_BIT(6)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P2; + else if (rwof & PPC_BIT(7)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P3; + else if (rwof & PPC_BIT(8)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P4; + else if (rwof & PPC_BIT(9)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P5; + else if (rwof & PPC_BIT(10)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P6; + else if (rwof & PPC_BIT(11)) ioc->err.err_src = P7IOC_ERR_SRC_CI_P7; + else if (rwof & PPC_BIT(16)) ioc->err.err_src = P7IOC_ERR_SRC_PHB0; + else if (rwof & PPC_BIT(17)) ioc->err.err_src = P7IOC_ERR_SRC_PHB1; + else if (rwof & PPC_BIT(18)) ioc->err.err_src = P7IOC_ERR_SRC_PHB2; + else if (rwof & PPC_BIT(19)) ioc->err.err_src = P7IOC_ERR_SRC_PHB3; + else if (rwof & PPC_BIT(20)) ioc->err.err_src = P7IOC_ERR_SRC_PHB4; + else if (rwof & PPC_BIT(21)) ioc->err.err_src = P7IOC_ERR_SRC_PHB5; + else if (rwof & PPC_BIT(24)) ioc->err.err_src = P7IOC_ERR_SRC_MISC; + else if (rwof & PPC_BIT(25)) ioc->err.err_src = P7IOC_ERR_SRC_I2C; + + /* + * If we detect any problematic components, the OS is + * expected to poll that for more details through OPAL + * interface. + */ + if (ioc->err.err_src != P7IOC_ERR_SRC_NONE) { + p7ioc_set_err_pending(ioc, true); + return true; + } + + return false; +} + +static void p7ioc_rgc_interrupt(void *data, uint32_t isn) +{ + struct p7ioc *ioc = data; + + printf("Got RGC interrupt 0x%04x\n", isn); + + /* We will notify OS while getting error from GEM */ + if (p7ioc_check_GEM(ioc)) + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); +} + +static const struct irq_source_ops p7ioc_rgc_irq_ops = { + .get_xive = p7ioc_rgc_get_xive, + .set_xive = p7ioc_rgc_set_xive, + .interrupt = p7ioc_rgc_interrupt, +}; + +static void p7ioc_create_hub(struct dt_node *np) +{ + struct p7ioc *ioc; + unsigned int i, id; + u64 bar1, bar2; + u32 pdt; + char *path; + + /* Use the BUID extension as ID and add it to device-tree */ + id = dt_prop_get_u32(np, "ibm,buid-ext"); + path = dt_get_path(np); + printf("P7IOC: Found at %s ID 0x%x\n", path, id); + free(path); + + /* Load VPD LID */ + vpd_iohub_load(np); + + ioc = zalloc(sizeof(struct p7ioc)); + if (!ioc) + return; + ioc->hub.hub_id = id; + ioc->hub.ops = &p7ioc_hub_ops; + ioc->dt_node = np; + + bar1 = dt_prop_get_u64(np, "ibm,gx-bar-1"); + bar2 = dt_prop_get_u64(np, "ibm,gx-bar-2"); + + ioc->regs = (void *)bar1; + + ioc->mmio1_win_start = bar1; + ioc->mmio1_win_size = MWIN1_SIZE; + ioc->mmio2_win_start = bar2; + ioc->mmio2_win_size = MWIN2_SIZE; + + ioc->buid_base = id << 9; + ioc->rgc_buid = ioc->buid_base + RGC_BUID_OFFSET; + + /* Add some DT properties */ + dt_add_property_cells(np, "ibm,opal-hubid", 0, id); + + /* XXX Fixme: how many RGC interrupts ? */ + dt_add_property_cells(np, "interrupts", ioc->rgc_buid << 4); + dt_add_property_cells(np, "interrupt-base", ioc->rgc_buid << 4); + + /* XXX What about ibm,opal-mmio-real ? */ + + /* Clear the RGC XIVE cache */ + for (i = 0; i < 16; i++) + ioc->xive_cache[i] = SETFIELD(IODA_XIVT_PRIORITY, 0ull, 0xff); + + /* + * Register RGC interrupts + * + * For now I assume only 0 is... 
to verify with Greg or HW guys, + * we support all 16 + */ + register_irq_source(&p7ioc_rgc_irq_ops, ioc, ioc->rgc_buid << 4, 1); + + /* Check for presence detect from HDAT, we use only BR1 on P7IOC */ + pdt = dt_prop_get_u32_def(np, "ibm,br1-presence-detect", 0xffffffff); + if (pdt != 0xffffffff) + printf("P7IOC: Presence detect from HDAT : 0x%02x\n", pdt); + else { + } + ioc->phb_pdt = pdt & 0xff; + + /* Setup PHB structures (no HW access yet) */ + for (i = 0; i < P7IOC_NUM_PHBS; i++) { + if (p7ioc_phb_enabled(ioc, i)) + p7ioc_phb_setup(ioc, i); + else + ioc->phbs[i].state = P7IOC_PHB_STATE_OFF; + } + + /* Now, we do the bulk of the inits */ + p7ioc_inits(ioc); + + printf("P7IOC: Initialization complete\n"); + + cec_register(&ioc->hub); +} + +void probe_p7ioc(void) +{ + struct dt_node *np; + + dt_for_each_compatible(dt_root, np, "ibm,p7ioc") + p7ioc_create_hub(np); +} + diff --git a/hw/phb3.c b/hw/phb3.c new file mode 100644 index 00000000..5e51e71b --- /dev/null +++ b/hw/phb3.c @@ -0,0 +1,3880 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * PHB3 support + * + */ + +/* + * + * FIXME: + * More stuff for EEH support: + * - PBCQ error reporting interrupt + * - I2C-based power management (replacing SHPC) + * - Directly detect fenced PHB through one dedicated HW reg + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Enable this to disable error interrupts for debug purposes */ +#undef DISABLE_ERR_INTS + +static void phb3_init_hw(struct phb3 *p); + +static void phb3_trace(struct phb3 *p, FILE *s, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); + +static void phb3_trace(struct phb3 *p, FILE *s, const char *fmt, ...) +{ + /* Use a temp stack buffer to print all at once to avoid + * mixups of a trace entry on SMP + */ + char tbuf[128 + 10]; + va_list args; + char *b = tbuf; + + b += sprintf(b, "PHB%d: ", p->phb.opal_id); + va_start(args, fmt); + vsnprintf(b, 128, fmt, args); + va_end(args); + fputs(tbuf, s); +} +#define PHBDBG(p, fmt...) phb3_trace(p, stdout, fmt) +#define PHBINF(p, fmt...) phb3_trace(p, stderr, fmt) +#define PHBERR(p, fmt...) phb3_trace(p, stderr, fmt) + +/* + * Lock callbacks. Allows the OPAL API handlers to lock the + * PHB around calls such as config space, EEH, etc... + */ +static void phb3_lock(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + lock(&p->lock); +} + +static void phb3_unlock(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + unlock(&p->lock); +} + +/* Helper to select an IODA table entry */ +static inline void phb3_ioda_sel(struct phb3 *p, uint32_t table, + uint32_t addr, bool autoinc) +{ + out_be64(p->regs + PHB_IODA_ADDR, + (autoinc ? 
PHB_IODA_AD_AUTOINC : 0) | + SETFIELD(PHB_IODA_AD_TSEL, 0ul, table) | + SETFIELD(PHB_IODA_AD_TADR, 0ul, addr)); +} + +/* Helper to set the state machine timeout */ +static inline uint64_t phb3_set_sm_timeout(struct phb3 *p, uint64_t dur) +{ + uint64_t target, now = mftb(); + + target = now + dur; + if (target == 0) + target++; + p->delay_tgt_tb = target; + + return dur; +} + +/* Check if AIB is fenced via PBCQ NFIR */ +static bool phb3_fenced(struct phb3 *p) +{ + uint64_t nfir; + + /* We still probably has crazy xscom */ + xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir); + if (nfir & PPC_BIT(16)) { + p->flags |= PHB3_AIB_FENCED; + p->state = PHB3_STATE_FENCED; + return true; + } + return false; +} + +/* + * Configuration space access + * + * The PHB lock is assumed to be already held + */ +static int64_t phb3_pcicfg_check(struct phb3 *p, uint32_t bdfn, + uint32_t offset, uint32_t size, + uint8_t *pe) +{ + uint32_t sm = size - 1; + + if (offset > 0xfff || bdfn > 0xffff) + return OPAL_PARAMETER; + if (offset & sm) + return OPAL_PARAMETER; + + /* The root bus only has a device at 0 and we get into an + * error state if we try to probe beyond that, so let's + * avoid that and just return an error to Linux + */ + if ((bdfn >> 8) == 0 && (bdfn & 0xff)) + return OPAL_HARDWARE; + + /* Check PHB state */ + if (p->state == PHB3_STATE_BROKEN) + return OPAL_HARDWARE; + + /* Fetch the PE# from cache */ + *pe = p->rte_cache[bdfn]; + + return OPAL_SUCCESS; +} + +#define PHB3_PCI_CFG_READ(size, type) \ +static int64_t phb3_pcicfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + struct phb3 *p = phb_to_phb3(phb); \ + uint64_t addr, val64; \ + int64_t rc; \ + uint8_t pe; \ + bool use_asb = false; \ + \ + /* Initialize data in case of error */ \ + *data = (type)0xffffffff; \ + \ + rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \ + if (rc) \ + return rc; \ + \ + if (p->flags & PHB3_AIB_FENCED) { \ + if (!(p->flags & PHB3_CFG_USE_ASB)) \ + return OPAL_HARDWARE; \ + use_asb = true; \ + } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \ + return OPAL_HARDWARE; \ + } \ + \ + addr = PHB_CA_ENABLE | ((uint64_t)bdfn << PHB_CA_FUNC_LSH); \ + addr = SETFIELD(PHB_CA_REG, addr, offset); \ + addr = SETFIELD(PHB_CA_PE, addr, pe); \ + if (use_asb) { \ + phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \ + sync(); \ + val64 = bswap_64(phb3_read_reg_asb(p, PHB_CONFIG_DATA)); \ + *data = (type)(val64 >> (8 * (offset & (4 - sizeof(type))))); \ + } else { \ + out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \ + *data = in_le##size(p->regs + PHB_CONFIG_DATA + \ + (offset & (4 - sizeof(type)))); \ + } \ + \ + return OPAL_SUCCESS; \ +} + +#define PHB3_PCI_CFG_WRITE(size, type) \ +static int64_t phb3_pcicfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + struct phb3 *p = phb_to_phb3(phb); \ + uint64_t addr, val64 = 0; \ + int64_t rc; \ + uint8_t pe; \ + bool use_asb = false; \ + \ + rc = phb3_pcicfg_check(p, bdfn, offset, sizeof(type), &pe); \ + if (rc) \ + return rc; \ + \ + if (p->flags & PHB3_AIB_FENCED) { \ + if (!(p->flags & PHB3_CFG_USE_ASB)) \ + return OPAL_HARDWARE; \ + use_asb = true; \ + } else if ((p->flags & PHB3_CFG_BLOCKED) && bdfn != 0) { \ + return OPAL_HARDWARE; \ + } \ + \ + addr = PHB_CA_ENABLE | ((uint64_t)bdfn << PHB_CA_FUNC_LSH); \ + addr = SETFIELD(PHB_CA_REG, addr, offset); \ + addr = SETFIELD(PHB_CA_PE, addr, pe); \ + if (use_asb) { \ + val64 = data; \ + val64 = bswap_64(val64 << 8 * (offset & (4 - sizeof(type)))); \ 
+ phb3_write_reg_asb(p, PHB_CONFIG_ADDRESS, addr); \ + sync(); \ + phb3_write_reg_asb(p, PHB_CONFIG_DATA, val64); \ + } else { \ + out_be64(p->regs + PHB_CONFIG_ADDRESS, addr); \ + out_le##size(p->regs + PHB_CONFIG_DATA + \ + (offset & (4 - sizeof(type))), data); \ + } \ + \ + return OPAL_SUCCESS; \ +} + +PHB3_PCI_CFG_READ(8, u8) +PHB3_PCI_CFG_READ(16, u16) +PHB3_PCI_CFG_READ(32, u32) +PHB3_PCI_CFG_WRITE(8, u8) +PHB3_PCI_CFG_WRITE(16, u16) +PHB3_PCI_CFG_WRITE(32, u32) + +static uint8_t phb3_choose_bus(struct phb *phb __unused, + struct pci_device *bridge __unused, + uint8_t candidate, uint8_t *max_bus __unused, + bool *use_max) +{ + /* Use standard bus number selection */ + *use_max = false; + return candidate; +} + +static void phb3_root_port_init(struct phb *phb, struct pci_device *dev, + int ecap, int aercap) +{ + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_SERR_EN | PCI_CFG_CMD_PERR_RESP); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + if (!aercap) return; + + /* Mask various unrecoverable errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, &val32); + val32 |= (PCIECAP_AER_UE_MASK_POISON_TLP | + PCIECAP_AER_UE_MASK_COMPL_TIMEOUT | + PCIECAP_AER_UE_MASK_COMPL_ABORT | + PCIECAP_AER_UE_MASK_ECRC); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, val32); + + /* Report various unrecoverable errors as fatal errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, &val32); + val32 |= (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_UNEXP_COMPL | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* Mask various recoverable errors */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, &val32); + val32 |= PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); + + /* Enable all error reporting */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, &val32); + val32 |= (PCIECAP_AER_RERR_CMD_FE | + PCIECAP_AER_RERR_CMD_NFE | + PCIECAP_AER_RERR_CMD_CE); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_RERR_CMD, val32); +} + +static void phb3_switch_port_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + struct phb3 *p = phb_to_phb3(phb); + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking and disable INTx */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN | + PCI_CFG_CMD_INTx_DIS); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Disable partity error and enable system error */ + pci_cfg_read16(phb, bdfn, PCI_CFG_BRCTL, &val16); + val16 &= ~PCI_CFG_BRCTL_PERR_RESP_EN; + 
val16 |= PCI_CFG_BRCTL_SERR_EN; + pci_cfg_write16(phb, bdfn, PCI_CFG_BRCTL, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 |= (PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT); + /* HW279570 - Disable reporting of correctable errors */ + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* Unmask all unrecoverable errors */ + if (!aercap) return; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_MASK, 0x0); + + /* Severity of unrecoverable errors */ + if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT) + val32 = (PCIECAP_AER_UE_SEVERITY_DLLP | + PCIECAP_AER_UE_SEVERITY_SURPRISE_DOWN | + PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_RECV_OVFLOW | + PCIECAP_AER_UE_SEVERITY_MALFORMED_TLP | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + else + val32 = (PCIECAP_AER_UE_SEVERITY_FLOW_CTL_PROT | + PCIECAP_AER_UE_SEVERITY_INTERNAL); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_UE_SEVERITY, val32); + + /* + * Mask various correctable errors + * + * On Murano and Venice DD1.0 we disable emission of corrected + * error messages to the PHB completely to workaround errata + * HW257476 causing the loss of tags. + */ + if (p->rev < PHB3_REV_MURANO_DD20) + val32 = 0xffffffff; + else + val32 = PCIECAP_AER_CE_MASK_ADV_NONFATAL; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, val32); + + /* Enable ECRC generation and disable ECRC check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= PCIECAP_AER_CAPCTL_ECRCG_EN; + val32 &= ~PCIECAP_AER_CAPCTL_ECRCC_EN; + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); +} + +static void phb3_endpoint_init(struct phb *phb, + struct pci_device *dev, + int ecap, int aercap) +{ + struct phb3 *p = phb_to_phb3(phb); + uint16_t bdfn = dev->bdfn; + uint16_t val16; + uint32_t val32; + + /* Enable SERR and parity checking */ + pci_cfg_read16(phb, bdfn, PCI_CFG_CMD, &val16); + val16 |= (PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN); + pci_cfg_write16(phb, bdfn, PCI_CFG_CMD, val16); + + /* Enable reporting various errors */ + if (!ecap) return; + pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, &val16); + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + val16 |= (PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT); + /* HW279570 - Disable reporting of correctable errors */ + val16 &= ~PCICAP_EXP_DEVCTL_CE_REPORT; + pci_cfg_write16(phb, bdfn, ecap + PCICAP_EXP_DEVCTL, val16); + + /* + * On Murano and Venice DD1.0 we disable emission of corrected + * error messages to the PHB completely to workaround errata + * HW257476 causing the loss of tags. 
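+ * (On DD2.0 and later the correctable error mask is left untouched.)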
+ */ + if (p->rev < PHB3_REV_MURANO_DD20) + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CE_MASK, + 0xffffffff); + + /* Enable ECRC generation and check */ + pci_cfg_read32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, &val32); + val32 |= (PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + pci_cfg_write32(phb, bdfn, aercap + PCIECAP_AER_CAPCTL, val32); +} + +static void phb3_device_init(struct phb *phb, struct pci_device *dev) +{ + int ecap = 0; + int aercap = 0; + + /* Figure out PCIe & AER capability */ + if (pci_has_cap(dev, PCI_CFG_CAP_ID_EXP, false)) { + ecap = pci_cap(dev, PCI_CFG_CAP_ID_EXP, false); + + if (!pci_has_cap(dev, PCIECAP_ID_AER, true)) { + aercap = pci_find_ecap(phb, dev->bdfn, + PCIECAP_ID_AER, NULL); + if (aercap > 0) + pci_set_cap(dev, PCIECAP_ID_AER, aercap, true); + } else { + aercap = pci_cap(dev, PCIECAP_ID_AER, true); + } + } + + /* Reconfigure the MPS */ + pci_configure_mps(phb, dev); + + if (dev->dev_type == PCIE_TYPE_ROOT_PORT) + phb3_root_port_init(phb, dev, ecap, aercap); + else if (dev->dev_type == PCIE_TYPE_SWITCH_UPPORT || + dev->dev_type == PCIE_TYPE_SWITCH_DNPORT) + phb3_switch_port_init(phb, dev, ecap, aercap); + else + phb3_endpoint_init(phb, dev, ecap, aercap); +} + +static int64_t phb3_pci_reinit(struct phb *phb, uint64_t scope, uint64_t data) +{ + struct pci_device *pd; + uint16_t bdfn = data; + + if (scope != OPAL_REINIT_PCI_DEV) + return OPAL_PARAMETER; + + pd = pci_find_dev(phb, bdfn); + if (!pd) + return OPAL_PARAMETER; + + phb3_device_init(phb, pd); + return OPAL_SUCCESS; +} + +static int64_t phb3_presence_detect(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + uint16_t slot_stat; + uint64_t hp_override; + int64_t rc; + + /* Test for PHB in error state ? */ + if (p->state == PHB3_STATE_BROKEN) + return OPAL_HARDWARE; + + /* XXX Check bifurcation stuff ? */ + + /* Read slot status register */ + rc = phb3_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTSTAT, + &slot_stat); + if (rc != OPAL_SUCCESS) + return OPAL_HARDWARE; + + /* Read hotplug override */ + hp_override = in_be64(p->regs + PHB_HOTPLUG_OVERRIDE); + + printf("PHB%d: slot_stat: 0x%04x, hp_override: 0x%016llx\n", + phb->opal_id, slot_stat, hp_override); + + /* So if the slot status says nothing connected, we bail out */ + if (!(slot_stat & PCICAP_EXP_SLOTSTAT_PDETECTST)) + return OPAL_SHPC_DEV_NOT_PRESENT; + + /* + * At this point, we can have one of those funky IBM + * systems that has the presence bit set in the slot + * status and nothing actually connected. If so, we + * check the hotplug override A/B bits + */ + if (p->use_ab_detect && + (hp_override & PHB_HPOVR_PRESENCE_A) && + (hp_override & PHB_HPOVR_PRESENCE_B)) + return OPAL_SHPC_DEV_NOT_PRESENT; + + /* + * Anything else, we assume device present, the link state + * machine will perform an early bail out if no electrical + * signaling is established after a second. + */ + return OPAL_SHPC_DEV_PRESENT; +} + +/* Clear IODA cache tables */ +static void phb3_init_ioda_cache(struct phb3 *p) +{ + uint32_t i; + uint64_t *data64; + + /* + * RTT and PELTV. RTE should be 0xFF's to indicate + * invalid PE# for the corresponding RID. + * + * Note: Instead we set all RTE entries to 0x00 to + * work around a problem where PE lookups might be + * done before Linux has established valid PE's + * (during PCI probing). We can revisit that once/if + * Linux has been fixed to always setup valid PEs. 
+ * + * The value 0x00 corresponds to the default PE# Linux + * uses to check for config space freezes before it + * has assigned PE# to busses. + * + * WARNING: Additionally, we need to be careful, there's + * a HW issue, if we get an MSI on an RTT entry that is + * FF, things will go bad. We need to ensure we don't + * ever let a live FF RTT even temporarily when resetting + * for EEH etc... (HW278969). + */ + memset(p->rte_cache, 0x00, RTT_TABLE_SIZE); + memset(p->peltv_cache, 0x0, sizeof(p->peltv_cache)); + + /* Disable all LSI */ + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) { + data64 = &p->lxive_cache[i]; + *data64 = SETFIELD(IODA2_LXIVT_PRIORITY, 0ul, 0xff); + *data64 = SETFIELD(IODA2_LXIVT_SERVER, *data64, 0x0); + } + + /* Diable all MSI */ + for (i = 0; i < ARRAY_SIZE(p->ive_cache); i++) { + data64 = &p->ive_cache[i]; + *data64 = SETFIELD(IODA2_IVT_PRIORITY, 0ul, 0xff); + *data64 = SETFIELD(IODA2_IVT_SERVER, *data64, 0x0); + } + + /* Clear TVT */ + memset(p->tve_cache, 0x0, sizeof(p->tve_cache)); + /* Clear M32 domain */ + memset(p->m32d_cache, 0x0, sizeof(p->m32d_cache)); + /* Clear M64 domain */ + memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache)); +} + +/* phb3_ioda_reset - Reset the IODA tables + * + * @purge: If true, the cache is cleared and the cleared values + * are applied to HW. If false, the cached values are + * applied to HW + * + * This reset the IODA tables in the PHB. It is called at + * initialization time, on PHB reset, and can be called + * explicitly from OPAL + */ +static int64_t phb3_ioda_reset(struct phb *phb, bool purge) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t server, prio; + uint64_t *pdata64, data64; + uint32_t i; + + if (purge) { + printf("PHB%d: Purging all IODA tables...\n", p->phb.opal_id); + phb3_init_ioda_cache(p); + } + + /* Init_27..28 - LIXVT */ + phb3_ioda_sel(p, IODA2_TBL_LXIVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) { + data64 = p->lxive_cache[i]; + server = GETFIELD(IODA2_LXIVT_SERVER, data64); + prio = GETFIELD(IODA2_LXIVT_PRIORITY, data64); + data64 = SETFIELD(IODA2_LXIVT_SERVER, data64, server); + data64 = SETFIELD(IODA2_LXIVT_PRIORITY, data64, prio); + out_be64(p->regs + PHB_IODA_DATA0, data64); + } + + /* Init_29..30 - MRT */ + phb3_ioda_sel(p, IODA2_TBL_MRT, 0, true); + for (i = 0; i < 8; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + /* Init_31..32 - TVT */ + phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]); + + /* Init_33..34 - M64BT */ + phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]); + + /* Init_35..36 - M32DT */ + phb3_ioda_sel(p, IODA2_TBL_M32DT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->m32d_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->m32d_cache[i]); + + /* Load RTE, PELTV */ + if (p->tbl_rtt) + memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE); + if (p->tbl_peltv) + memcpy((void *)p->tbl_peltv, p->peltv_cache, PELTV_TABLE_SIZE); + + /* Load IVT */ + if (p->tbl_ivt) { + pdata64 = (uint64_t *)p->tbl_ivt; + for (i = 0; i < IVT_TABLE_ENTRIES; i++) + pdata64[i * IVT_TABLE_STRIDE] = p->ive_cache[i]; + } + + /* Invalidate RTE, IVE, TCE cache */ + out_be64(p->regs + PHB_RTC_INVALIDATE, PHB_RTC_INVALIDATE_ALL); + out_be64(p->regs + PHB_IVC_INVALIDATE, PHB_IVC_INVALIDATE_ALL); + out_be64(p->regs + PHB_TCE_KILL, PHB_TCE_KILL_ALL); + + /* Clear RBA */ + if (p->rev >= PHB3_REV_MURANO_DD20) { 
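+ /* Walk the RBA with auto-increment and zero all 32 entries */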
+ phb3_ioda_sel(p, IODA2_TBL_RBA, 0, true); + for (i = 0; i < 32; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0x0ul); + } + + /* Clear PEST & PEEV */ + for (i = 0; i < PHB3_MAX_PE_NUM; i++) { + uint64_t pesta, pestb; + + phb3_ioda_sel(p, IODA2_TBL_PESTA, i, false); + pesta = in_be64(p->regs + PHB_IODA_DATA0); + out_be64(p->regs + PHB_IODA_DATA0, 0); + phb3_ioda_sel(p, IODA2_TBL_PESTB, i, false); + pestb = in_be64(p->regs + PHB_IODA_DATA0); + out_be64(p->regs + PHB_IODA_DATA0, 0); + + if ((pesta & IODA2_PESTA_MMIO_FROZEN) || + (pestb & IODA2_PESTB_DMA_STOPPED)) + PHBDBG(p, "Frozen PE#%d (%s - %s)\n", + i, (pesta & IODA2_PESTA_MMIO_FROZEN) ? "DMA" : "", + (pestb & IODA2_PESTB_DMA_STOPPED) ? "MMIO" : ""); + } + + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < 4; i++) + out_be64(p->regs + PHB_IODA_DATA0, 0); + + return OPAL_SUCCESS; +} + +static int64_t phb3_set_phb_mem_window(struct phb *phb, + uint16_t window_type, + uint16_t window_num, + uint64_t addr, + uint64_t __unused pci_addr, + uint64_t size) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t data64; + + /* + * By design, PHB3 doesn't support IODT any more. + * Besides, we can't enable M32 BAR as well. So + * the function is used to do M64 mapping and each + * BAR is supposed to be shared by all PEs. + */ + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16) + return OPAL_PARAMETER; + + data64 = p->m64b_cache[window_num]; + if (data64 & IODA2_M64BT_SINGLE_PE) { + if ((addr & 0x1FFFFFFul) || + (size & 0x1FFFFFFul)) + return OPAL_PARAMETER; + } else { + if ((addr & 0xFFFFFul) || + (size & 0xFFFFFul)) + return OPAL_PARAMETER; + } + + /* size should be 2^N */ + if (!size || size & (size-1)) + return OPAL_PARAMETER; + + /* address should be size aligned */ + if (addr & (size - 1)) + return OPAL_PARAMETER; + + break; + default: + return OPAL_PARAMETER; + } + + if (data64 & IODA2_M64BT_SINGLE_PE) { + data64 = SETFIELD(IODA2_M64BT_SINGLE_BASE, data64, + addr >> 25); + data64 = SETFIELD(IODA2_M64BT_SINGLE_MASK, data64, + 0x20000000 - (size >> 25)); + } else { + data64 = SETFIELD(IODA2_M64BT_BASE, data64, + addr >> 20); + data64 = SETFIELD(IODA2_M64BT_MASK, data64, + 0x40000000 - (size >> 20)); + } + p->m64b_cache[window_num] = data64; + + return OPAL_SUCCESS; +} + +/* + * For one specific M64 BAR, it can be shared by all PEs, + * or owned by single PE exclusively. + */ +static int64_t phb3_phb_mmio_enable(struct phb *phb, + uint16_t window_type, + uint16_t window_num, + uint16_t enable) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t data64, base, mask; + + /* + * By design, PHB3 doesn't support IODT any more. + * Besides, we can't enable M32 BAR as well. So + * the function is used to do M64 mapping and each + * BAR is supposed to be shared by all PEs. + */ + switch (window_type) { + case OPAL_IO_WINDOW_TYPE: + case OPAL_M32_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16 || + enable > OPAL_ENABLE_M64_NON_SPLIT) + return OPAL_PARAMETER; + break; + default: + return OPAL_PARAMETER; + } + + /* + * We need check the base/mask while enabling + * the M64 BAR. 
Otherwise, invalid base/mask + * might cause fenced AIB unintentionally + */ + data64 = p->m64b_cache[window_num]; + switch (enable) { + case OPAL_DISABLE_M64: + data64 &= ~IODA2_M64BT_SINGLE_PE; + data64 &= ~IODA2_M64BT_ENABLE; + break; + case OPAL_ENABLE_M64_SPLIT: + if (data64 & IODA2_M64BT_SINGLE_PE) + return OPAL_PARAMETER; + base = GETFIELD(IODA2_M64BT_BASE, data64); + base = (base << 20); + mask = GETFIELD(IODA2_M64BT_MASK, data64); + if (base < p->mm0_base || !mask) + return OPAL_PARTIAL; + + data64 |= IODA2_M64BT_ENABLE; + break; + case OPAL_ENABLE_M64_NON_SPLIT: + if (!(data64 & IODA2_M64BT_SINGLE_PE)) + return OPAL_PARAMETER; + base = GETFIELD(IODA2_M64BT_SINGLE_BASE, data64); + base = (base << 25); + mask = GETFIELD(IODA2_M64BT_SINGLE_MASK, data64); + if (base < p->mm0_base || !mask) + return OPAL_PARTIAL; + + data64 |= IODA2_M64BT_SINGLE_PE; + data64 |= IODA2_M64BT_ENABLE; + break; + } + + /* Update HW and cache */ + phb3_ioda_sel(p, IODA2_TBL_M64BT, window_num, false); + out_be64(p->regs + PHB_IODA_DATA0, data64); + p->m64b_cache[window_num] = data64; + return OPAL_SUCCESS; +} + +static int64_t phb3_map_pe_mmio_window(struct phb *phb, + uint16_t pe_num, + uint16_t window_type, + uint16_t window_num, + uint16_t segment_num) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t data64, *cache; + + if (pe_num >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + /* + * PHB3 doesn't support IODT any more. On the other + * hand, PHB3 support M64DT with much more flexibility. + * we need figure it out later. At least, we never use + * M64DT in kernel. + */ + switch(window_type) { + case OPAL_IO_WINDOW_TYPE: + return OPAL_UNSUPPORTED; + case OPAL_M32_WINDOW_TYPE: + if (window_num != 0 || segment_num >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + cache = &p->m32d_cache[segment_num]; + phb3_ioda_sel(p, IODA2_TBL_M32DT, segment_num, false); + out_be64(p->regs + PHB_IODA_DATA0, + SETFIELD(IODA2_M32DT_PE, 0ull, pe_num)); + *cache = SETFIELD(IODA2_M32DT_PE, 0ull, pe_num); + + break; + case OPAL_M64_WINDOW_TYPE: + if (window_num >= 16) + return OPAL_PARAMETER; + cache = &p->m64b_cache[window_num]; + data64 = *cache; + + /* The BAR shouldn't be enabled yet */ + if (data64 & IODA2_M64BT_ENABLE) + return OPAL_PARTIAL; + + data64 |= IODA2_M64BT_SINGLE_PE; + data64 = SETFIELD(IODA2_M64BT_PE_HI, data64, pe_num >> 5); + data64 = SETFIELD(IODA2_M64BT_PE_LOW, data64, pe_num); + *cache = data64; + + break; + default: + return OPAL_PARAMETER; + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_map_pe_dma_window(struct phb *phb, + uint16_t pe_num, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t tts_encoded; + uint64_t data64 = 0; + + /* + * Sanity check. 
We currently only support "2 window per PE" mode + * ie, only bit 59 of the PCI address is used to select the window + */ + if (pe_num >= PHB3_MAX_PE_NUM || + (window_id >> 1) != pe_num) + return OPAL_PARAMETER; + + /* + * tce_table_size == 0 is used to disable an entry, in this case + * we ignore other arguments + */ + if (tce_table_size == 0) { + phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + p->tve_cache[window_id] = 0; + return OPAL_SUCCESS; + } + + /* Additional arguments validation */ + if (tce_levels < 1 || tce_levels > 5 || + !is_pow2(tce_table_size) || + tce_table_size < 0x1000) + return OPAL_PARAMETER; + + /* Encode TCE table size */ + data64 = SETFIELD(IODA2_TVT_TABLE_ADDR, 0ul, tce_table_addr >> 12); + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 31) + return OPAL_PARAMETER; + data64 = SETFIELD(IODA2_TVT_TCE_TABLE_SIZE, data64, tts_encoded); + + /* Encode TCE page size */ + switch (tce_page_size) { + case 0x1000: /* 4K */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 1); + break; + case 0x10000: /* 64K */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 5); + break; + case 0x1000000: /* 16M */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 13); + break; + case 0x10000000: /* 256M */ + data64 = SETFIELD(IODA2_TVT_IO_PSIZE, data64, 17); + break; + default: + return OPAL_PARAMETER; + } + + /* Encode number of levels */ + data64 = SETFIELD(IODA2_TVT_NUM_LEVELS, data64, tce_levels - 1); + + phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, data64); + p->tve_cache[window_id] = data64; + + return OPAL_SUCCESS; +} + +static int64_t phb3_map_pe_dma_window_real(struct phb *phb, + uint16_t pe_num, + uint16_t window_id, + uint64_t pci_start_addr, + uint64_t pci_mem_size) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t end = pci_start_addr + pci_mem_size; + uint64_t tve; + + if (pe_num >= PHB3_MAX_PE_NUM || + (window_id >> 1) != pe_num) + return OPAL_PARAMETER; + + if (pci_mem_size) { + /* Enable */ + + /* + * Check that the start address has the right TVE index, + * we only support the 1 bit mode where each PE has 2 + * TVEs + */ + if ((pci_start_addr >> 59) != (window_id & 1)) + return OPAL_PARAMETER; + pci_start_addr &= ((1ull << 59) - 1); + end = pci_start_addr + pci_mem_size; + + /* We have to be 16M aligned */ + if ((pci_start_addr & 0x00ffffff) || + (pci_mem_size & 0x00ffffff)) + return OPAL_PARAMETER; + + /* + * It *looks* like this is the max we can support (we need + * to verify this. Also we are not checking for rollover, + * but then we aren't trying too hard to protect ourselves + * againt a completely broken OS. 
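+ * (0x0003ffffffffffffull is 2^50 - 1: the end address must fit in 50 bits.)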
+ */ + if (end > 0x0003ffffffffffffull) + return OPAL_PARAMETER; + + /* + * Put start address bits 49:24 into TVE[52:53]||[0:23] + * and end address bits 49:24 into TVE[54:55]||[24:47] + * and set TVE[51] + */ + tve = (pci_start_addr << 16) & (0xffffffull << 48); + tve |= (pci_start_addr >> 38) & (3ull << 10); + tve |= (end >> 8) & (0xfffffful << 16); + tve |= (end >> 40) & (3ull << 8); + tve |= PPC_BIT(51); + } else { + /* Disable */ + tve = 0; + } + + phb3_ioda_sel(p, IODA2_TBL_TVT, window_id, false); + out_be64(p->regs + PHB_IODA_DATA0, tve); + p->tve_cache[window_id] = tve; + + return OPAL_SUCCESS; +} + +static void phb3_pci_msi_check_q(struct phb3 *p, uint32_t ive_num) +{ + uint64_t ive, ivc, ffi; + uint8_t *q_byte; + + /* Each IVE has 16-bytes or 128-bytes */ + ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8); + q_byte = (uint8_t *)(ive + 5); + + /* + * Handle Q bit. If the Q bit doesn't show up, + * we would have CI load to make that. + */ + if (!(*q_byte & 0x1)) { + /* Read from random PHB reg to force flush */ + in_be64(p->regs + PHB_IVC_UPDATE); + + /* Order with subsequent read of Q */ + sync(); + + /* Q still not set, bail out */ + if (!(*q_byte & 0x1)) + return; + } + + /* Lock FFI and send interrupt */ + while (in_be64(p->regs + PHB_FFI_LOCK)) + /* XXX Handle fences ! */ + ; + + /* Clear Q bit and update IVC */ + *q_byte = 0; + ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) | + PHB_IVC_UPDATE_ENABLE_Q; + out_be64(p->regs + PHB_IVC_UPDATE, ivc); + + /* + * Resend interrupt. Note the lock clear bit isn't documented in + * the PHB3 spec and thus is probably unnecessary but it's in + * IODA2 so let's be safe here, it won't hurt to set it + */ + ffi = SETFIELD(PHB_FFI_REQUEST_ISN, 0ul, ive_num) | PHB_FFI_LOCK_CLEAR; + out_be64(p->regs + PHB_FFI_REQUEST, ffi); +} + +static int64_t phb3_pci_msi_eoi(struct phb *phb, + uint32_t hwirq) +{ + struct phb3 *p = phb_to_phb3(phb); + uint32_t ive_num = PHB3_IRQ_NUM(hwirq); + uint64_t ive, ivc; + uint8_t *p_byte, gp, gen; + + /* OS might not configure IVT yet */ + if (!p->tbl_ivt) + return OPAL_HARDWARE; + + /* Each IVE has 16-bytes or 128-bytes */ + ive = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8); + p_byte = (uint8_t *)(ive + 4); + + /* Read generation and P */ + gp = *p_byte; + gen = gp >> 1; + + /* Increment generation count and clear P */ + *p_byte = ((gen + 1) << 1) & 0x7; + + /* Update the IVC with a match against the old gen count */ + ivc = SETFIELD(PHB_IVC_UPDATE_SID, 0ul, ive_num) | + PHB_IVC_UPDATE_ENABLE_P | + PHB_IVC_UPDATE_ENABLE_GEN | + SETFIELD(PHB_IVC_UPDATE_GEN_MATCH, 0ul, gen); + out_be64(p->regs + PHB_IVC_UPDATE, ivc); + + /* Handle Q bit */ + phb3_pci_msi_check_q(p, ive_num); + + return OPAL_SUCCESS; +} + +static int64_t phb3_set_ive_pe(struct phb *phb, + uint32_t pe_num, + uint32_t ive_num) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t *cache, ivep, data64; + uint16_t *pe_word; + + /* OS should enable the BAR in advance */ + if (!p->tbl_ivt) + return OPAL_HARDWARE; + + /* Each IVE reserves 128 bytes */ + if (pe_num >= PHB3_MAX_PE_NUM || + ive_num >= IVT_TABLE_ENTRIES) + return OPAL_PARAMETER; + + /* Update IVE cache */ + cache = &p->ive_cache[ive_num]; + *cache = SETFIELD(IODA2_IVT_PE, *cache, pe_num); + + /* Update in-memory IVE without clobbering P and Q */ + ivep = p->tbl_ivt + (ive_num * IVT_TABLE_STRIDE * 8); + pe_word = (uint16_t *)(ivep + 6); + *pe_word = pe_num; + + /* Invalidate IVC */ + data64 = SETFIELD(PHB_IVC_INVALIDATE_SID, 0ul, ive_num); + out_be64(p->regs + PHB_IVC_INVALIDATE, data64); + + 
return OPAL_SUCCESS; +} + +static int64_t phb3_get_msi_32(struct phb *phb __unused, + uint32_t pe_num, + uint32_t ive_num, + uint8_t msi_range, + uint32_t *msi_address, + uint32_t *message_data) +{ + /* + * Sanity check. We needn't check on mve_number (PE#) + * on PHB3 since the interrupt source is purely determined + * by its DMA address and data, but the check isn't + * harmful. + */ + if (pe_num >= PHB3_MAX_PE_NUM || + ive_num >= IVT_TABLE_ENTRIES || + msi_range != 1 || !msi_address|| !message_data) + return OPAL_PARAMETER; + + /* + * DMA address and data will form the IVE index. + * For more details, please refer to IODA2 spec. + */ + *msi_address = 0xFFFF0000 | ((ive_num << 4) & 0xFFFFFE0F); + *message_data = ive_num & 0x1F; + + return OPAL_SUCCESS; +} + +static int64_t phb3_get_msi_64(struct phb *phb __unused, + uint32_t pe_num, + uint32_t ive_num, + uint8_t msi_range, + uint64_t *msi_address, + uint32_t *message_data) +{ + /* Sanity check */ + if (pe_num >= PHB3_MAX_PE_NUM || + ive_num >= IVT_TABLE_ENTRIES || + msi_range != 1 || !msi_address || !message_data) + return OPAL_PARAMETER; + + /* + * DMA address and data will form the IVE index. + * For more details, please refer to IODA2 spec. + */ + *msi_address = (0x1ul << 60) | ((ive_num << 4) & 0xFFFFFFFFFFFFFE0Ful); + *message_data = ive_num & 0x1F; + + return OPAL_SUCCESS; +} + +static bool phb3_err_check_pbcq(struct phb3 *p) +{ + uint64_t nfir, mask, wof, val64; + int32_t class, bit; + uint64_t severity[PHB3_ERR_CLASS_LAST] = { + 0x0000000000000000, /* NONE */ + 0x018000F800000000, /* DEAD */ + 0x7E7DC70000000000, /* FENCED */ + 0x0000000000000000, /* ER */ + 0x0000000000000000 /* INF */ + }; + + /* + * Read on NFIR to see if XSCOM is working properly. + * If XSCOM doesn't work well, we need take the PHB + * into account any more. + */ + xscom_read(p->chip_id, p->pe_xscom + 0x0, &nfir); + if (nfir == 0xffffffffffffffff) { + p->err.err_src = PHB3_ERR_SRC_NONE; + p->err.err_class = PHB3_ERR_CLASS_DEAD; + phb3_set_err_pending(p, true); + return true; + } + + /* + * Check WOF. We need handle unmasked errors firstly. + * We probably run into the situation (on simulator) + * where we have asserted FIR bits, but WOF has nothing. + * For that case, we should check FIR as well. + */ + xscom_read(p->chip_id, p->pe_xscom + 0x3, &mask); + xscom_read(p->chip_id, p->pe_xscom + 0x8, &wof); + if (wof & ~mask) + wof &= ~mask; + if (!wof) { + if (nfir & ~mask) + nfir &= ~mask; + if (!nfir) + return false; + wof = nfir; + } + + /* We shouldn't hit class PHB3_ERR_CLASS_NONE */ + for (class = PHB3_ERR_CLASS_NONE; + class < PHB3_ERR_CLASS_LAST; + class++) { + val64 = wof & severity[class]; + if (!val64) + continue; + + for (bit = 0; bit < 64; bit++) { + if (val64 & PPC_BIT(bit)) { + p->err.err_src = PHB3_ERR_SRC_PBCQ; + p->err.err_class = class; + p->err.err_bit = 63 - bit; + phb3_set_err_pending(p, true); + return true; + } + } + } + + return false; +} + +static bool phb3_err_check_lem(struct phb3 *p) +{ + uint64_t fir, wof, mask, val64; + int32_t class, bit; + uint64_t severity[PHB3_ERR_CLASS_LAST] = { + 0x0000000000000000, /* NONE */ + 0x0000000000000000, /* DEAD */ + 0xADB670C980ADD151, /* FENCED */ + 0x000800107F500A2C, /* ER */ + 0x42018E2200002482 /* INF */ + }; + + /* + * Read FIR. 
If XSCOM or ASB is frozen, we needn't + * go forward and just mark the PHB with dead state + */ + fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM); + if (fir == 0xffffffffffffffff) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = PHB3_ERR_CLASS_DEAD; + phb3_set_err_pending(p, true); + return true; + } + + /* + * Check on WOF for the unmasked errors firstly. Under + * some situation where we run skiboot on simulator, + * we already had FIR bits asserted, but WOF is still zero. + * For that case, we check FIR directly. + */ + wof = phb3_read_reg_asb(p, PHB_LEM_WOF); + mask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK); + if (wof & ~mask) + wof &= ~mask; + if (!wof) { + if (fir & ~mask) + fir &= ~mask; + if (!fir) + return false; + wof = fir; + } + + /* We shouldn't hit PHB3_ERR_CLASS_NONE */ + for (class = PHB3_ERR_CLASS_NONE; + class < PHB3_ERR_CLASS_LAST; + class++) { + val64 = wof & severity[class]; + if (!val64) + continue; + + for (bit = 0; bit < 64; bit++) { + if (val64 & PPC_BIT(bit)) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = class; + p->err.err_bit = 63 - bit; + phb3_set_err_pending(p, true); + return true; + } + } + } + + return false; +} + +/* + * The function can be called during error recovery for INF + * and ER class. For INF case, it's expected to be called + * when grabbing the error log. We will call it explicitly + * when clearing frozen PE state for ER case. + */ +static void phb3_err_ER_clear(struct phb3 *p) +{ + uint32_t val32; + uint64_t val64; + uint64_t fir = in_be64(p->regs + PHB_LEM_FIR_ACCUM); + + /* Rec 1: Grab the PCI config lock */ + /* Removed... unnecessary. We have our own lock here */ + + /* Rec 2/3/4: Take all inbound transactions */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000001c00000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0x10000000); + + /* Rec 5/6/7: Clear pending non-fatal errors */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000005000000000ul); + val32 = in_be32(p->regs + PHB_CONFIG_DATA); + out_be32(p->regs + PHB_CONFIG_DATA, (val32 & 0xe0700000) | 0x0f000f00); + + /* Rec 8/9/10: Clear pending fatal errors for AER */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000010400000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 11/12/13: Clear pending non-fatal errors for AER */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000011000000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 22/23/24: Clear root port errors */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000013000000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0xffffffff); + + /* Rec 25/26/27: Enable IO and MMIO bar */ + out_be64(p->regs + PHB_CONFIG_ADDRESS, 0x8000004000000000ul); + out_be32(p->regs + PHB_CONFIG_DATA, 0x470100f8); + + /* Rec 28: Release the PCI config lock */ + /* Removed... unnecessary. 
We have our own lock here */ + + /* Rec 29...34: Clear UTL errors */ + val64 = in_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS); + out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, val64); + val64 = in_be64(p->regs + UTL_PCIE_PORT_STATUS); + out_be64(p->regs + UTL_PCIE_PORT_STATUS, val64); + val64 = in_be64(p->regs + UTL_RC_STATUS); + out_be64(p->regs + UTL_RC_STATUS, val64); + + /* Rec 39...66: Clear PHB error trap */ + val64 = in_be64(p->regs + PHB_ERR_STATUS); + out_be64(p->regs + PHB_ERR_STATUS, val64); + out_be64(p->regs + PHB_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_ERR_LOG_1, 0x0ul); + + val64 = in_be64(p->regs + PHB_OUT_ERR_STATUS); + out_be64(p->regs + PHB_OUT_ERR_STATUS, val64); + out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0ul); + + val64 = in_be64(p->regs + PHB_INA_ERR_STATUS); + out_be64(p->regs + PHB_INA_ERR_STATUS, val64); + out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0ul); + + val64 = in_be64(p->regs + PHB_INB_ERR_STATUS); + out_be64(p->regs + PHB_INB_ERR_STATUS, val64); + out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0ul); + out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0ul); + out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0ul); + + /* Rec 67/68: Clear FIR/WOF */ + out_be64(p->regs + PHB_LEM_FIR_AND_MASK, ~fir); + out_be64(p->regs + PHB_LEM_WOF, 0x0ul); +} + +static void phb3_read_phb_status(struct phb3 *p, + struct OpalIoPhb3ErrorData *stat) +{ + bool locked; + uint16_t val; + uint64_t *pPEST; + uint64_t val64 = 0; + uint32_t i; + + memset(stat, 0, sizeof(struct OpalIoPhb3ErrorData)); + + /* Error data common part */ + stat->common.version = OPAL_PHB_ERROR_DATA_VERSION_1; + stat->common.ioType = OPAL_PHB_ERROR_DATA_TYPE_PHB3; + stat->common.len = sizeof(struct OpalIoPhb3ErrorData); + + /* + * We read some registers using config space through AIB. + * + * Get to other registers using ASB when possible to get to them + * through a fence if one is present. + */ + + /* Use ASB to access PCICFG if the PHB has been fenced */ + locked = lock_recursive(&p->lock); + p->flags |= PHB3_CFG_USE_ASB; + + /* Grab RC bridge control, make it 32-bit */ + phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &val); + stat->brdgCtl = val; + + /* Grab UTL status registers */ + stat->portStatusReg = hi32(phb3_read_reg_asb(p, UTL_PCIE_PORT_STATUS)); + stat->rootCmplxStatus = hi32(phb3_read_reg_asb(p, UTL_RC_STATUS)); + stat->busAgentStatus = hi32(phb3_read_reg_asb(p, UTL_SYS_BUS_AGENT_STATUS)); + + /* + * Grab various RC PCIe capability registers. All device, slot + * and link status are 16-bit, so we grab the pair control+status + * for each of them + */ + phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_DEVCTL, + &stat->deviceStatus); + phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_SLOTCTL, + &stat->slotStatus); + phb3_pcicfg_read32(&p->phb, 0, p->ecap + PCICAP_EXP_LCTL, + &stat->linkStatus); + + /* + * I assume those are the standard config space header, cmd & status + * together makes 32-bit. 
Secondary status is 16-bit so I'll clear + * the top on that one + */ + phb3_pcicfg_read32(&p->phb, 0, PCI_CFG_CMD, &stat->devCmdStatus); + phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_SECONDARY_STATUS, &val); + stat->devSecStatus = val; + + /* Grab a bunch of AER regs */ + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_RERR_STA, + &stat->rootErrorStatus); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_UE_STATUS, + &stat->uncorrErrorStatus); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, + &stat->corrErrorStatus); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG0, + &stat->tlpHdr1); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG1, + &stat->tlpHdr2); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG2, + &stat->tlpHdr3); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_HDR_LOG3, + &stat->tlpHdr4); + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_SRCID, + &stat->sourceId); + + /* Restore to AIB */ + p->flags &= ~PHB3_CFG_USE_ASB; + if (locked) { + unlock(&p->lock); + pci_put_phb(&p->phb); + } + + /* PEC NFIR */ + xscom_read(p->chip_id, p->pe_xscom + 0x0, &stat->nFir); + xscom_read(p->chip_id, p->pe_xscom + 0x3, &stat->nFirMask); + xscom_read(p->chip_id, p->pe_xscom + 0x8, &stat->nFirWOF); + + /* PHB3 inbound and outbound error Regs */ + stat->phbPlssr = phb3_read_reg_asb(p, PHB_CPU_LOADSTORE_STATUS); + stat->phbCsr = phb3_read_reg_asb(p, PHB_DMA_CHAN_STATUS); + stat->lemFir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM); + stat->lemErrorMask = phb3_read_reg_asb(p, PHB_LEM_ERROR_MASK); + stat->lemWOF = phb3_read_reg_asb(p, PHB_LEM_WOF); + stat->phbErrorStatus = phb3_read_reg_asb(p, PHB_ERR_STATUS); + stat->phbFirstErrorStatus = phb3_read_reg_asb(p, PHB_ERR1_STATUS); + stat->phbErrorLog0 = phb3_read_reg_asb(p, PHB_ERR_LOG_0); + stat->phbErrorLog1 = phb3_read_reg_asb(p, PHB_ERR_LOG_1); + stat->mmioErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR_STATUS); + stat->mmioFirstErrorStatus = phb3_read_reg_asb(p, PHB_OUT_ERR1_STATUS); + stat->mmioErrorLog0 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_0); + stat->mmioErrorLog1 = phb3_read_reg_asb(p, PHB_OUT_ERR_LOG_1); + stat->dma0ErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR_STATUS); + stat->dma0FirstErrorStatus = phb3_read_reg_asb(p, PHB_INA_ERR1_STATUS); + stat->dma0ErrorLog0 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_0); + stat->dma0ErrorLog1 = phb3_read_reg_asb(p, PHB_INA_ERR_LOG_1); + stat->dma1ErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR_STATUS); + stat->dma1FirstErrorStatus = phb3_read_reg_asb(p, PHB_INB_ERR1_STATUS); + stat->dma1ErrorLog0 = phb3_read_reg_asb(p, PHB_INB_ERR_LOG_0); + stat->dma1ErrorLog1 = phb3_read_reg_asb(p, PHB_INB_ERR_LOG_1); + + /* + * Grab PESTA & B content. The error bit (bit#0) should + * be fetched from IODA and the left content from memory + * resident tables. 
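+ * The in-memory PEST interleaves both halves: even doublewords hold
+ * PESTA, odd doublewords hold PESTB.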
+ */ + pPEST = (uint64_t *)p->tbl_pest; + val64 = PHB_IODA_AD_AUTOINC; + val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTA); + phb3_write_reg_asb(p, PHB_IODA_ADDR, val64); + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + stat->pestA[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0); + stat->pestA[i] |= pPEST[2 * i]; + } + + val64 = PHB_IODA_AD_AUTOINC; + val64 = SETFIELD(PHB_IODA_AD_TSEL, val64, IODA2_TBL_PESTB); + phb3_write_reg_asb(p, PHB_IODA_ADDR, val64); + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + stat->pestB[i] = phb3_read_reg_asb(p, PHB_IODA_DATA0); + stat->pestB[i] |= pPEST[2 * i + 1]; + } +} + +static int64_t phb3_msi_get_xive(void *data, + uint32_t isn, + uint16_t *server, + uint8_t *prio) +{ + struct phb3 *p = data; + uint32_t chip, index, irq; + uint64_t ive; + + chip = P8_IRQ_TO_CHIP(isn); + index = P8_IRQ_TO_PHB(isn); + irq = PHB3_IRQ_NUM(isn); + + if (chip != p->chip_id || + index != p->index || + irq > PHB3_MSI_IRQ_MAX) + return OPAL_PARAMETER; + + /* + * Each IVE has 16 bytes in cache. Note that the kernel + * should strip the link bits from server field. + */ + ive = p->ive_cache[irq]; + *server = GETFIELD(IODA2_IVT_SERVER, ive); + *prio = GETFIELD(IODA2_IVT_PRIORITY, ive); + + return OPAL_SUCCESS; +} + +static int64_t phb3_msi_set_xive(void *data, + uint32_t isn, + uint16_t server, + uint8_t prio) +{ + struct phb3 *p = data; + uint32_t chip, index; + uint64_t *cache, ive_num, data64, m_server, m_prio; + uint32_t *ive; + + chip = P8_IRQ_TO_CHIP(isn); + index = P8_IRQ_TO_PHB(isn); + ive_num = PHB3_IRQ_NUM(isn); + + if (p->state == PHB3_STATE_BROKEN || !p->tbl_rtt) + return OPAL_HARDWARE; + if (chip != p->chip_id || + index != p->index || + ive_num > PHB3_MSI_IRQ_MAX) + return OPAL_PARAMETER; + + /* + * We need strip the link from server. As Milton told + * me, the server is assigned as follows and the left + * bits unused: node/chip/core/thread/link = 2/3/4/3/2 + * + * Note: the server has added the link bits to server. + */ + m_server = server; + m_prio = prio; + + cache = &p->ive_cache[ive_num]; + *cache = SETFIELD(IODA2_IVT_SERVER, *cache, m_server); + *cache = SETFIELD(IODA2_IVT_PRIORITY, *cache, m_prio); + + /* + * Update IVT and IVC. We need use IVC update register + * to do that. Each IVE in the table has 128 bytes + */ + ive = (uint32_t *)(p->tbl_ivt + ive_num * IVT_TABLE_STRIDE * 8); + data64 = PHB_IVC_UPDATE_ENABLE_SERVER | PHB_IVC_UPDATE_ENABLE_PRI; + data64 = SETFIELD(PHB_IVC_UPDATE_SID, data64, ive_num); + data64 = SETFIELD(PHB_IVC_UPDATE_SERVER, data64, m_server); + data64 = SETFIELD(PHB_IVC_UPDATE_PRI, data64, m_prio); + + /* + * We don't use SETFIELD because we are doing a 32-bit access + * in order to avoid touching the P and Q bits + */ + *ive = (m_server << 8) | m_prio; + out_be64(p->regs + PHB_IVC_UPDATE, data64); + + /* + * Handle Q bit if we're going to enable the interrupt. + * The OS should make sure the interrupt handler has + * been installed already. 
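+ * (Priority 0xff means the interrupt is masked, so the Q bit is only
+ * replayed for unmasked priorities.)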
+ */ + if (prio != 0xff) + phb3_pci_msi_check_q(p, ive_num); + + return OPAL_SUCCESS; +} + +static int64_t phb3_lsi_get_xive(void *data, + uint32_t isn, + uint16_t *server, + uint8_t *prio) +{ + struct phb3 *p = data; + uint32_t chip, index, irq; + uint64_t lxive; + + chip = P8_IRQ_TO_CHIP(isn); + index = P8_IRQ_TO_PHB(isn); + irq = PHB3_IRQ_NUM(isn); + + if (chip != p->chip_id || + index != p->index || + irq < PHB3_LSI_IRQ_MIN || + irq > PHB3_LSI_IRQ_MAX) + return OPAL_PARAMETER; + + lxive = p->lxive_cache[irq - PHB3_LSI_IRQ_MIN]; + *server = GETFIELD(IODA2_LXIVT_SERVER, lxive); + *prio = GETFIELD(IODA2_LXIVT_PRIORITY, lxive); + + return OPAL_SUCCESS; +} + +static int64_t phb3_lsi_set_xive(void *data, + uint32_t isn, + uint16_t server, + uint8_t prio) +{ + struct phb3 *p = data; + uint32_t chip, index, irq, entry; + uint64_t lxive; + + chip = P8_IRQ_TO_CHIP(isn); + index = P8_IRQ_TO_PHB(isn); + irq = PHB3_IRQ_NUM(isn); + + if (p->state == PHB3_STATE_BROKEN) + return OPAL_HARDWARE; + + if (chip != p->chip_id || + index != p->index || + irq < PHB3_LSI_IRQ_MIN || + irq > PHB3_LSI_IRQ_MAX) + return OPAL_PARAMETER; + + lxive = SETFIELD(IODA2_LXIVT_SERVER, 0ul, server); + lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio); + + /* + * We cache the arguments because we have to mangle + * it in order to hijack 3 bits of priority to extend + * the server number + */ + entry = irq - PHB3_LSI_IRQ_MIN; + p->lxive_cache[entry] = lxive; + + /* We use HRT entry 0 always for now */ + phb3_ioda_sel(p, IODA2_TBL_LXIVT, entry, false); + lxive = in_be64(p->regs + PHB_IODA_DATA0); + lxive = SETFIELD(IODA2_LXIVT_SERVER, lxive, server); + lxive = SETFIELD(IODA2_LXIVT_PRIORITY, lxive, prio); + out_be64(p->regs + PHB_IODA_DATA0, lxive); + + return OPAL_SUCCESS; +} + +static void phb3_err_interrupt(void *data, uint32_t isn) +{ + struct phb3 *p = data; + + PHBDBG(p, "Got interrupt 0x%08x\n", isn); + + /* Update pending event */ + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + + /* If the PHB is broken, go away */ + if (p->state == PHB3_STATE_BROKEN) + return; + + /* + * Mark the PHB has pending error so that the OS + * can handle it at late point. 
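+ * (The details are gathered later, when the OS asks OPAL for the
+ * next PCI error.)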
+ */ + phb3_set_err_pending(p, true); +} + +/* MSIs (OS owned) */ +static const struct irq_source_ops phb3_msi_irq_ops = { + .get_xive = phb3_msi_get_xive, + .set_xive = phb3_msi_set_xive, +}; + +/* LSIs (OS owned) */ +static const struct irq_source_ops phb3_lsi_irq_ops = { + .get_xive = phb3_lsi_get_xive, + .set_xive = phb3_lsi_set_xive, +}; + +/* Error LSIs (skiboot owned) */ +static const struct irq_source_ops phb3_err_lsi_irq_ops = { + .get_xive = phb3_lsi_get_xive, + .set_xive = phb3_lsi_set_xive, + .interrupt = phb3_err_interrupt, +}; + +static int64_t phb3_set_pe(struct phb *phb, + uint64_t pe_num, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t mask, val, tmp, idx; + int32_t all = 0; + uint16_t *rte; + + /* Sanity check */ + if (!p->tbl_rtt) + return OPAL_HARDWARE; + if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (pe_num >= PHB3_MAX_PE_NUM || bdfn > 0xffff || + bcompare > OpalPciBusAll || + dcompare > OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare > OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_PARAMETER; + + /* Figure out the RID range */ + if (bcompare == OpalPciBusAny) { + mask = 0x0; + val = 0x0; + all = 0x1; + } else { + tmp = ((0x1 << (bcompare + 1)) - 1) << (15 - bcompare); + mask = tmp; + val = bdfn & tmp; + } + + if (dcompare == OPAL_IGNORE_RID_DEVICE_NUMBER) + all = (all << 1) | 0x1; + else { + mask |= 0xf8; + val |= (bdfn & 0xf8); + } + + if (fcompare == OPAL_IGNORE_RID_FUNCTION_NUMBER) + all = (all << 1) | 0x1; + else { + mask |= 0x7; + val |= (bdfn & 0x7); + } + + /* Map or unmap the RTT range */ + if (all == 0x7) { + if (action == OPAL_MAP_PE) { + for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++) + p->rte_cache[idx] = pe_num; + } else { + memset(p->rte_cache, 0xff, RTT_TABLE_SIZE); + } + memcpy((void *)p->tbl_rtt, p->rte_cache, RTT_TABLE_SIZE); + out_be64(p->regs + PHB_RTC_INVALIDATE, + PHB_RTC_INVALIDATE_ALL); + } else { + rte = (uint16_t *)p->tbl_rtt; + for (idx = 0; idx < RTT_TABLE_ENTRIES; idx++, rte++) { + if ((idx & mask) != val) + continue; + p->rte_cache[idx] = (action ? pe_num : 0xffff); + *rte = p->rte_cache[idx]; + + /* + * We might not need invalidate RTC one by one since + * the RTT is expected to be updated in batch mode + * in host kernel. + */ + out_be64(p->regs + PHB_RTC_INVALIDATE, + SETFIELD(PHB_RTC_INVALIDATE_RID, 0ul, idx)); + } + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_set_peltv(struct phb *phb, + uint32_t parent_pe, + uint32_t child_pe, + uint8_t state) +{ + struct phb3 *p = phb_to_phb3(phb); + uint8_t *peltv; + uint32_t idx, mask; + + /* Sanity check */ + if (!p->tbl_peltv) + return OPAL_HARDWARE; + if (parent_pe >= PHB3_MAX_PE_NUM || child_pe >= PHB3_MAX_PE_NUM) + return OPAL_PARAMETER; + + /* Find index for parent PE */ + idx = parent_pe * (PHB3_MAX_PE_NUM / 8); + idx += (child_pe / 8); + mask = 0x1 << (7 - (child_pe % 8)); + + peltv = (uint8_t *)p->tbl_peltv; + peltv += idx; + if (state) { + *peltv |= mask; + p->peltv_cache[idx] |= mask; + } else { + *peltv &= ~mask; + p->peltv_cache[idx] &= ~mask; + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_link_state(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + uint16_t lstat; + int64_t rc; + + /* XXX Test for PHB in error state ? 
*/ + + /* Link is up, let's find the actual speed */ + if (!(reg & PHB_PCIE_DLP_TC_DL_LINKACT)) + return OPAL_SHPC_LINK_DOWN; + + rc = phb3_pcicfg_read16(&p->phb, 0, p->ecap + PCICAP_EXP_LSTAT, + &lstat); + if (rc < 0) { + /* Shouldn't happen */ + PHBERR(p, "Failed to read link status\n"); + return OPAL_HARDWARE; + } + if (!(lstat & PCICAP_EXP_LSTAT_DLLL_ACT)) + return OPAL_SHPC_LINK_DOWN; + + return GETFIELD(PCICAP_EXP_LSTAT_WIDTH, lstat); +} + +static int64_t phb3_power_state(struct phb __unused *phb) +{ + /* XXX Test for PHB in error state ? */ + + /* XXX TODO - External power control ? */ + + return OPAL_SHPC_POWER_ON; +} + +static int64_t phb3_slot_power_off(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + if (p->state == PHB3_STATE_BROKEN) + return OPAL_HARDWARE; + if (p->state != PHB3_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* XXX TODO - External power control ? */ + + return OPAL_SUCCESS; +} + +static int64_t phb3_slot_power_on(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + if (p->state == PHB3_STATE_BROKEN) + return OPAL_HARDWARE; + if (p->state != PHB3_STATE_FUNCTIONAL) + return OPAL_BUSY; + + /* XXX TODO - External power control ? */ + + return OPAL_SUCCESS; +} + +static void phb3_setup_for_link_down(struct phb3 *p) +{ + uint32_t reg32; + + /* Mark link down */ + p->has_link = false; + + /* Mask PCIE port interrupts */ + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000); + + /* Mask AER receiver error */ + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_MASK, ®32); + reg32 |= PCIECAP_AER_CE_RECVR_ERR; + phb3_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_MASK, reg32); +} + +static void phb3_setup_for_link_up(struct phb3 *p) +{ + uint32_t reg32; + + /* Clear AER receiver error status */ + phb3_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_STATUS, + PCIECAP_AER_CE_RECVR_ERR); + /* Unmask receiver error status in AER */ + phb3_pcicfg_read32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_MASK, ®32); + reg32 &= ~PCIECAP_AER_CE_RECVR_ERR; + phb3_pcicfg_write32(&p->phb, 0, p->aercap + PCIECAP_AER_CE_MASK, reg32); + + /* Clear spurrious errors and enable PCIE port interrupts */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffdfffffffffffff); + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad5a800000000000); + + /* Mark link down */ + p->has_link = true; + + /* Don't block PCI-CFG */ + p->flags &= ~PHB3_CFG_BLOCKED; +} + +static int64_t phb3_sm_link_poll(struct phb3 *p) +{ + uint64_t reg; + + /* This is the state machine to wait for the link to come + * up. Currently we just wait until we timeout, eventually + * we want to add retries and fallback to Gen1. + */ + switch(p->state) { + case PHB3_STATE_WAIT_LINK_ELECTRICAL: + /* Wait for the link electrical connection to be + * established (shorter timeout). 
This allows us to + * workaround spurrious presence detect on some machines + * without waiting 10s each time + * + * Note: We *also* check for the full link up bit here + * because simics doesn't seem to implement the electrical + * link bit at all + */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (reg & (PHB_PCIE_DLP_INBAND_PRESENCE | + PHB_PCIE_DLP_TC_DL_LINKACT)) { + PHBDBG(p, "Electrical link detected...\n"); + p->state = PHB3_STATE_WAIT_LINK; + p->retries = PHB3_LINK_WAIT_RETRIES; + } else if (p->retries-- == 0) { + PHBDBG(p, "Timeout waiting for electrical link\n"); + PHBDBG(p, "DLP train control: 0x%016llx\n", reg); + /* No link, we still mark the PHB as functional */ + p->state = PHB3_STATE_FUNCTIONAL; + return OPAL_SUCCESS; + } + return phb3_set_sm_timeout(p, msecs_to_tb(100)); + case PHB3_STATE_WAIT_LINK: + /* XXX I used the PHB_PCIE_LINK_MANAGEMENT register here but + * simics doesn't seem to give me anything, so I've switched + * to PCIE_DLP_TRAIN_CTL which appears more reliable + */ + reg = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (reg & PHB_PCIE_DLP_TC_DL_LINKACT) { + /* Setup PHB for link up */ + phb3_setup_for_link_up(p); + PHBDBG(p, "Link is up!\n"); + p->state = PHB3_STATE_FUNCTIONAL; + return OPAL_SUCCESS; + } + if (p->retries-- == 0) { + PHBDBG(p, "Timeout waiting for link up\n"); + PHBDBG(p, "DLP train control: 0x%016llx\n", reg); + /* No link, we still mark the PHB as functional */ + p->state = PHB3_STATE_FUNCTIONAL; + return OPAL_SUCCESS; + } + return phb3_set_sm_timeout(p, msecs_to_tb(100)); + default: + /* How did we get here ? */ + assert(false); + } + return OPAL_HARDWARE; +} + +static int64_t phb3_start_link_poll(struct phb3 *p) +{ + /* + * Wait for link up to 10s. However, we give up after + * only a second if the electrical connection isn't + * stablished according to the DLP link control register + */ + p->retries = PHB3_LINK_ELECTRICAL_RETRIES; + p->state = PHB3_STATE_WAIT_LINK_ELECTRICAL; + return phb3_set_sm_timeout(p, msecs_to_tb(100)); +} + +static int64_t phb3_sm_hot_reset(struct phb3 *p) +{ + uint16_t brctl; + + switch (p->state) { + case PHB3_STATE_FUNCTIONAL: + /* We need do nothing with available slot */ + if (phb3_presence_detect(&p->phb) != OPAL_SHPC_DEV_PRESENT) { + PHBDBG(p, "Slot hreset: no device\n"); + return OPAL_CLOSED; + } + + /* Prepare for link going down */ + phb3_setup_for_link_down(p); + + /* Turn on hot reset */ + phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl |= PCI_CFG_BRCTL_SECONDARY_RESET; + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + PHBDBG(p, "Slot hreset: assert reset\n"); + + p->state = PHB3_STATE_HRESET_DELAY; + return phb3_set_sm_timeout(p, secs_to_tb(1)); + case PHB3_STATE_HRESET_DELAY: + /* Turn off hot reset */ + phb3_pcicfg_read16(&p->phb, 0, PCI_CFG_BRCTL, &brctl); + brctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET; + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, brctl); + PHBDBG(p, "Slot hreset: deassert reset\n"); + + /* + * Due to some oddball adapters bouncing the link + * training a couple of times, we wait for a full second + * before we start checking the link status, otherwise + * we can get a spurrious link down interrupt which + * causes us to EEH immediately. 
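+		 *
+		 * The PHB3_STATE_HRESET_DELAY2 state below implements that
+		 * one second pause before we hand over to the generic link
+		 * polling state machine.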
+ */ + p->state = PHB3_STATE_HRESET_DELAY2; + return phb3_set_sm_timeout(p, secs_to_tb(1)); + case PHB3_STATE_HRESET_DELAY2: + return phb3_start_link_poll(p); + default: + PHBDBG(p, "Slot hreset: wrong state %d\n", p->state); + break; + } + + p->state = PHB3_STATE_FUNCTIONAL; + return OPAL_HARDWARE; +} + +static int64_t phb3_hot_reset(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + if (p->state != PHB3_STATE_FUNCTIONAL) { + PHBDBG(p, "phb3_hot_reset: wrong state %d\n", + p->state); + return OPAL_HARDWARE; + } + + p->flags |= PHB3_CFG_BLOCKED; + return phb3_sm_hot_reset(p); +} + +static int64_t phb3_sm_fundamental_reset(struct phb3 *p) +{ + uint64_t reg; + + + /* + * Check if there's something connected. We do that here + * instead of the switch case below because we want to do + * that before we test the skip_perst + */ + if (p->state == PHB3_STATE_FUNCTIONAL && + phb3_presence_detect(&p->phb) != OPAL_SHPC_DEV_PRESENT) { + PHBDBG(p, "Slot freset: no device\n"); + return OPAL_CLOSED; + } + + /* Handle boot time skipping of reset */ + if (p->skip_perst && p->state == PHB3_STATE_FUNCTIONAL) { + PHBINF(p, "Cold boot, skipping PERST assertion\n"); + p->state = PHB3_STATE_FRESET_ASSERT_DELAY; + /* PERST skipping happens only once */ + p->skip_perst = false; + } + + switch(p->state) { + case PHB3_STATE_FUNCTIONAL: + PHBINF(p, "Performing PERST...\n"); + + /* Prepare for link going down */ + phb3_setup_for_link_down(p); + + /* Assert PERST */ + reg = in_be64(p->regs + PHB_RESET); + reg &= ~0x2000000000000000ul; + out_be64(p->regs + PHB_RESET, reg); + PHBDBG(p, "Slot freset: Asserting PERST\n"); + + /* XXX Check delay for PERST... doing 1s for now */ + p->state = PHB3_STATE_FRESET_ASSERT_DELAY; + return phb3_set_sm_timeout(p, secs_to_tb(1)); + + case PHB3_STATE_FRESET_ASSERT_DELAY: + /* Deassert PERST */ + reg = in_be64(p->regs + PHB_RESET); + reg |= 0x2000000000000000ul; + out_be64(p->regs + PHB_RESET, reg); + PHBDBG(p, "Slot freset: Deasserting PERST\n"); + + /* Wait 200ms before polling link */ + p->state = PHB3_STATE_FRESET_DEASSERT_DELAY; + return phb3_set_sm_timeout(p, msecs_to_tb(200)); + + case PHB3_STATE_FRESET_DEASSERT_DELAY: + /* Switch to generic link poll state machine */ + return phb3_start_link_poll(p); + + default: + PHBDBG(p, "Slot freset: wrong state %d\n", + p->state); + break; + } + + p->state = PHB3_STATE_FUNCTIONAL; + return OPAL_HARDWARE; +} + +static int64_t phb3_fundamental_reset(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + + if (p->state != PHB3_STATE_FUNCTIONAL) { + PHBDBG(p, "phb3_fundamental_reset: wrong state %d\n", p->state); + return OPAL_HARDWARE; + } + + p->flags |= PHB3_CFG_BLOCKED; + return phb3_sm_fundamental_reset(p); +} + +/* + * The OS is expected to do fundamental reset after complete + * reset to make sure the PHB could be recovered from the + * fenced state. However, the OS needn't do that explicitly + * since fundamental reset will be done automatically while + * powering on the PHB. + * + * + * Usually, we need power off/on the PHB. That includes the + * fundamental reset. However, we don't know how to control + * the power stuff yet. So skip that and do fundamental reset + * directly after reinitialization the hardware. + */ +static int64_t phb3_sm_complete_reset(struct phb3 *p) +{ + uint64_t cqsts, val; + + switch (p->state) { + case PHB3_STATE_FENCED: + case PHB3_STATE_FUNCTIONAL: + /* + * The users might be doing error injection through PBCQ + * Error Inject Control Register. 
Without clearing that, + * we will get recrusive error during recovery and it will + * fail eventually. + */ + xscom_write(p->chip_id, p->pe_xscom + 0xa, 0x0ul); + + /* + * We might have escalated frozen state on non-existing PE + * to fenced PHB. For the case, the PHB isn't fenced in the + * hardware level and it's not safe to do ETU reset. So we + * have to force fenced PHB prior to ETU reset. + */ + if (!phb3_fenced(p)) + xscom_write(p->chip_id, p->pe_xscom + 0x2, 0x000000f000000000ull); + + /* Clear errors in NFIR and raise ETU reset */ + xscom_read(p->chip_id, p->pe_xscom + 0x0, &p->nfir_cache); + + xscom_read(p->chip_id, p->spci_xscom + 1, &val);/* HW275117 */ + xscom_write(p->chip_id, p->pci_xscom + 0xa, + 0x8000000000000000); + p->state = PHB3_STATE_CRESET_WAIT_CQ; + p->retries = 500; + return phb3_set_sm_timeout(p, msecs_to_tb(10)); + case PHB3_STATE_CRESET_WAIT_CQ: + xscom_read(p->chip_id, p->pe_xscom + 0x1c, &val); + xscom_read(p->chip_id, p->pe_xscom + 0x1d, &val); + xscom_read(p->chip_id, p->pe_xscom + 0x1e, &val); + xscom_read(p->chip_id, p->pe_xscom + 0xf, &cqsts); + if (!(cqsts & 0xC000000000000000)) { + xscom_write(p->chip_id, p->pe_xscom + 0x1, ~p->nfir_cache); + + p->state = PHB3_STATE_CRESET_REINIT; + return phb3_set_sm_timeout(p, msecs_to_tb(100)); + } + + if (p->retries-- == 0) { + PHBERR(p, "Timeout waiting for pending transaction\n"); + goto error; + } + return phb3_set_sm_timeout(p, msecs_to_tb(10)); + case PHB3_STATE_CRESET_REINIT: + p->flags &= ~PHB3_AIB_FENCED; + phb3_init_hw(p); + + p->state = PHB3_STATE_CRESET_FRESET; + return phb3_set_sm_timeout(p, msecs_to_tb(100)); + case PHB3_STATE_CRESET_FRESET: + p->state = PHB3_STATE_FUNCTIONAL; + p->flags |= PHB3_CFG_BLOCKED; + return phb3_sm_fundamental_reset(p); + default: + assert(false); + } + + /* Mark the PHB as dead and expect it to be removed */ +error: + p->state = PHB3_STATE_BROKEN; + return OPAL_PARAMETER; +} + +static int64_t phb3_complete_reset(struct phb *phb, uint8_t assert) +{ + struct phb3 *p = phb_to_phb3(phb); + + if ((assert == OPAL_ASSERT_RESET && + p->state != PHB3_STATE_FUNCTIONAL && + p->state != PHB3_STATE_FENCED) || + (assert == OPAL_DEASSERT_RESET && + p->state != PHB3_STATE_FUNCTIONAL)) { + PHBERR(p, "phb3_creset: wrong state %d\n", + p->state); + return OPAL_HARDWARE; + } + + /* Block PCI-CFG access */ + p->flags |= PHB3_CFG_BLOCKED; + + if (assert == OPAL_ASSERT_RESET) { + PHBINF(p, "Starting PHB reset sequence\n"); + return phb3_sm_complete_reset(p); + } else { + return phb3_sm_hot_reset(p); + } +} + +static int64_t phb3_poll(struct phb *phb) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t now = mftb(); + + if (p->state == PHB3_STATE_FUNCTIONAL) + return OPAL_SUCCESS; + + /* Check timer */ + if (p->delay_tgt_tb && + tb_compare(now, p->delay_tgt_tb) == TB_ABEFOREB) + return p->delay_tgt_tb - now; + + /* Expired (or not armed), clear it */ + p->delay_tgt_tb = 0; + + /* Dispatch to the right state machine */ + switch(p->state) { + case PHB3_STATE_HRESET_DELAY: + case PHB3_STATE_HRESET_DELAY2: + return phb3_sm_hot_reset(p); + case PHB3_STATE_FRESET_ASSERT_DELAY: + case PHB3_STATE_FRESET_DEASSERT_DELAY: + return phb3_sm_fundamental_reset(p); + case PHB3_STATE_CRESET_WAIT_CQ: + case PHB3_STATE_CRESET_REINIT: + case PHB3_STATE_CRESET_FRESET: + return phb3_sm_complete_reset(p); + case PHB3_STATE_WAIT_LINK_ELECTRICAL: + case PHB3_STATE_WAIT_LINK: + return phb3_sm_link_poll(p); + default: + PHBDBG(p, "phb3_poll: wrong state %d\n", p->state); + break; + } + + /* Unknown state, could be a HW error 
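+	 *
+	 * Any state not handled by the dispatch above ends up here and
+	 * is reported back to the caller as OPAL_HARDWARE.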
*/ + return OPAL_HARDWARE; +} + +static int64_t phb3_eeh_freeze_status(struct phb *phb, uint64_t pe_number, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity, + uint64_t *phb_status) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t peev_bit = PPC_BIT(pe_number & 0x3f); + uint64_t peev, pesta, pestb; + + /* Defaults: not frozen */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + + /* Check dead */ + if (p->state == PHB3_STATE_BROKEN) { + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_DEAD; + return OPAL_HARDWARE; + } + + /* Check fence */ + if (phb3_fenced(p)) { + *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE; + *pci_error_type = OPAL_EEH_PHB_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_PHB_FENCED; + goto bail; + } + + /* Check the PEEV */ + phb3_ioda_sel(p, IODA2_TBL_PEEV, pe_number / 64, false); + peev = in_be64(p->regs + PHB_IODA_DATA0); + if (!(peev & peev_bit)) + return OPAL_SUCCESS; + + /* Indicate that we have an ER pending */ + phb3_set_err_pending(p, true); + if (severity) + *severity = OPAL_EEH_SEV_PE_ER; + + /* Read the PESTA & PESTB */ + phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false); + pesta = in_be64(p->regs + PHB_IODA_DATA0); + phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false); + pestb = in_be64(p->regs + PHB_IODA_DATA0); + + /* Convert them */ + if (pesta & IODA2_PESTA_MMIO_FROZEN) + *freeze_state |= OPAL_EEH_STOPPED_MMIO_FREEZE; + if (pestb & IODA2_PESTB_DMA_STOPPED) + *freeze_state |= OPAL_EEH_STOPPED_DMA_FREEZE; + +bail: + if (phb_status) + phb3_read_phb_status(p, + (struct OpalIoPhb3ErrorData *)phb_status); + + return OPAL_SUCCESS; +} + +static int64_t phb3_eeh_freeze_clear(struct phb *phb, uint64_t pe_number, + uint64_t eeh_action_token) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t err, peev[4]; + int32_t i; + bool frozen_pe = false; + + if (p->state == PHB3_STATE_BROKEN) + return OPAL_HARDWARE; + + /* Summary. If nothing, move to clearing the PESTs which can + * contain a freeze state from a previous error or simply set + * explicitely by the user + */ + err = in_be64(p->regs + PHB_ETU_ERR_SUMMARY); + if (err == 0xffffffffffffffff) { + if (phb3_fenced(p)) { + PHBERR(p, "eeh_freeze_clear on fenced PHB\n"); + return OPAL_HARDWARE; + } + } + if (err != 0) + phb3_err_ER_clear(p); + + /* + * We have PEEV in system memory. It would give more performance + * to access that directly. 
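+	 *
+	 * For now we go through the IODA window instead: clear PESTA/PESTB
+	 * for this PE below, then re-scan PEEV through PHB_IODA_DATA0 to
+	 * decide whether an error report should remain pending.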
+ */ + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO) { + phb3_ioda_sel(p, IODA2_TBL_PESTA, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + if (eeh_action_token & OPAL_EEH_ACTION_CLEAR_FREEZE_DMA) { + phb3_ioda_sel(p, IODA2_TBL_PESTB, pe_number, false); + out_be64(p->regs + PHB_IODA_DATA0, 0); + } + + + /* Update ER pending indication */ + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < ARRAY_SIZE(peev); i++) { + peev[i] = in_be64(p->regs + PHB_IODA_DATA0); + if (peev[i]) { + frozen_pe = true; + break; + } + } + if (frozen_pe) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = PHB3_ERR_CLASS_ER; + p->err.err_bit = -1; + phb3_set_err_pending(p, true); + } else + phb3_set_err_pending(p, false); + + return OPAL_SUCCESS; +} + +static int64_t phb3_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t fir, peev[4]; + uint32_t cfg32; + int32_t i, j; + + /* If the PHB is broken, we needn't go forward */ + if (p->state == PHB3_STATE_BROKEN) { + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + return OPAL_SUCCESS; + } + + /* + * Check if we already have pending errors. If that's + * the case, then to get more information about the + * pending errors. Here we try PBCQ prior to PHB. + */ + if (phb3_err_pending(p) && + !phb3_err_check_pbcq(p) && + !phb3_err_check_lem(p)) + phb3_set_err_pending(p, false); + + /* Clear result */ + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + *first_frozen_pe = (uint64_t)-1; + + /* Check frozen PEs */ + if (!phb3_err_pending(p)) { + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < ARRAY_SIZE(peev); i++) { + peev[i] = in_be64(p->regs + PHB_IODA_DATA0); + if (peev[i]) { + p->err.err_src = PHB3_ERR_SRC_PHB; + p->err.err_class = PHB3_ERR_CLASS_ER; + p->err.err_bit = -1; + phb3_set_err_pending(p, true); + break; + } + } + } + + /* Mapping errors */ + if (phb3_err_pending(p)) { + /* + * If the frozen PE is caused by a malfunctioning TLP, we + * need reset the PHB. So convert ER to PHB-fatal error + * for the case. + */ + if (p->err.err_class == PHB3_ERR_CLASS_ER) { + fir = phb3_read_reg_asb(p, PHB_LEM_FIR_ACCUM); + if (fir & PPC_BIT(60)) { + phb3_pcicfg_read32(&p->phb, 0, + p->aercap + PCIECAP_AER_UE_STATUS, &cfg32); + if (cfg32 & PCIECAP_AER_UE_MALFORMED_TLP) + p->err.err_class = PHB3_ERR_CLASS_FENCED; + } + } + + switch (p->err.err_class) { + case PHB3_ERR_CLASS_DEAD: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + break; + case PHB3_ERR_CLASS_FENCED: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_FENCED; + break; + case PHB3_ERR_CLASS_ER: + *pci_error_type = OPAL_EEH_PE_ERROR; + *severity = OPAL_EEH_SEV_PE_ER; + + phb3_ioda_sel(p, IODA2_TBL_PEEV, 0, true); + for (i = 0; i < ARRAY_SIZE(peev); i++) + peev[i] = in_be64(p->regs + PHB_IODA_DATA0); + for (i = ARRAY_SIZE(peev) - 1; i >= 0; i--) { + for (j = 0; j < 64; j++) { + if (peev[i] & PPC_BIT(j)) { + *first_frozen_pe = i * 64 + j; + break; + } + } + + if (*first_frozen_pe != (uint64_t)(-1)) + break; + } + + /* No frozen PE ? 
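+			 *
+			 * (If the PEEV scan above found nothing, we downgrade
+			 * to "no error" below and clear the pending flag.)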
*/ + if (*first_frozen_pe == (uint64_t)-1) { + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + phb3_set_err_pending(p, false); + } + + break; + case PHB3_ERR_CLASS_INF: + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_INF; + break; + default: + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + phb3_set_err_pending(p, false); + } + } + + return OPAL_SUCCESS; +} + +static int64_t phb3_get_diag_data(struct phb *phb, + void *diag_buffer, + uint64_t diag_buffer_len) +{ + struct phb3 *p = phb_to_phb3(phb); + struct OpalIoPhb3ErrorData *data = diag_buffer; + + if (diag_buffer_len < sizeof(struct OpalIoPhb3ErrorData)) + return OPAL_PARAMETER; + if (p->state == PHB3_STATE_BROKEN) + return OPAL_HARDWARE; + + /* + * Dummy check for fence so that phb3_read_phb_status knows + * whether to use ASB or AIB + */ + phb3_fenced(p); + phb3_read_phb_status(p, data); + + /* + * We're running to here probably because of errors + * (INF class). For that case, we need clear the error + * explicitly. + */ + if (phb3_err_pending(p) && + p->err.err_class == PHB3_ERR_CLASS_INF && + p->err.err_src == PHB3_ERR_SRC_PHB) { + phb3_err_ER_clear(p); + phb3_set_err_pending(p, false); + } + + return OPAL_SUCCESS; +} + +static uint64_t capp_fsp_lid_load(void) +{ + uint32_t lid_no = 0x80a02001; /* murano dd2.1 */ + +#define CAPP_UCODE_MAX_SIZE 0x4000 + void *data = malloc(CAPP_UCODE_MAX_SIZE); + size_t size; + int rc; + if (!data) { + prerror("PHB3: Failed to allocated memory for capp ucode lid\n"); + return 0; + } + size = CAPP_UCODE_MAX_SIZE; + rc = fsp_fetch_data(0, FSP_DATASET_NONSP_LID, lid_no, 0, data, &size); + if (rc) { + prerror("PHB3: Error %d loading capp ucode lid\n", rc); + free(data); + return 0; + } + + return (uint64_t)data; +} + +static int64_t capp_load_ucode(struct phb3 *p) +{ + + struct capp_ucode_lid_hdr *ucode_hdr; + struct capp_ucode_data_hdr *data_hdr; + uint64_t data, *val; + int size_read = 0; + int i; + + /* if fsp not present p->ucode_base gotten from device tree */ + if (fsp_present() && (p->capp_ucode_base == 0)) + p->capp_ucode_base = capp_fsp_lid_load(); + + if (p->capp_ucode_base == 0) { + PHBERR(p, "capp ucode base address not set\n"); + return OPAL_HARDWARE; + } + + PHBINF(p, "Loading capp microcode @%llx\n", p->capp_ucode_base); + ucode_hdr = (struct capp_ucode_lid_hdr *)(p->capp_ucode_base); + if (ucode_hdr->eyecatcher != 0x43415050554c4944) { + PHBERR(p, "capi ucode lid header eyecatcher not found\n"); + return OPAL_HARDWARE; + } + + data_hdr = (struct capp_ucode_data_hdr *)((uint64_t)ucode_hdr + sizeof(*ucode_hdr)); + while (size_read < ucode_hdr->data_size) { + if (data_hdr->eyecatcher != 0x4341505055434F44) { + PHBERR(p, "capi ucode data header eyecatcher not found!\n"); + return OPAL_HARDWARE; + } + + val = (uint64_t *)data_hdr + sizeof(*data_hdr)/sizeof(uint64_t); + if (data_hdr->reg == apc_master_cresp) { + xscom_write(p->chip_id, CAPP_APC_MASTER_ARRAY_ADDR_REG, 0); + for (i = 0; i < data_hdr->num_data_chunks; i++) + xscom_write(p->chip_id, CAPP_APC_MASTER_ARRAY_WRITE_REG, *val++); + xscom_read(p->chip_id, CAPP_APC_MASTER_ARRAY_ADDR_REG, &data); + } else if (data_hdr->reg == apc_master_uop_table) { + xscom_write(p->chip_id, CAPP_APC_MASTER_ARRAY_ADDR_REG, 0x180ULL << 52); + for (i = 0; i < data_hdr->num_data_chunks; i++) + xscom_write(p->chip_id, CAPP_APC_MASTER_ARRAY_WRITE_REG, *val++); + xscom_read(p->chip_id, CAPP_APC_MASTER_ARRAY_ADDR_REG, &data); + } else if (data_hdr->reg == snp_ttype) { + 
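+			/* Snoop ttype array: same pattern as the APC master
+			 * arrays above - set the array address register, then
+			 * stream the data words in through the write register.
+			 */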
xscom_write(p->chip_id, CAPP_SNP_ARRAY_ADDR_REG, 0x5000ULL << 48); + for (i = 0; i < data_hdr->num_data_chunks; i++) + xscom_write(p->chip_id, CAPP_SNP_ARRAY_WRITE_REG, *val++); + xscom_read(p->chip_id, CAPP_SNP_ARRAY_ADDR_REG, &data); + } else if (data_hdr->reg == snp_uop_table) { + xscom_write(p->chip_id, CAPP_SNP_ARRAY_ADDR_REG, 0x4000ULL << 48); + for (i = 0; i < data_hdr->num_data_chunks; i++) + xscom_write(p->chip_id, CAPP_SNP_ARRAY_WRITE_REG, *val++); + xscom_read(p->chip_id, CAPP_SNP_ARRAY_ADDR_REG, &data); + } + + size_read += sizeof(*data_hdr) + data_hdr->num_data_chunks * 8; + data_hdr = (struct capp_ucode_data_hdr *)((uint64_t *)data_hdr + + sizeof(*data_hdr)/8 + data_hdr->num_data_chunks); + } + + p->capp_ucode_loaded = true; + return OPAL_SUCCESS; +} + +static void phb3_init_capp_regs(struct phb3 *p) +{ + /* writing these vals directly based on lab procedures + but some values included in microcode need to investigate */ + + /* port0 port1 + * 100 PHB0 disabled + * we're told it's the same for Venice + */ + xscom_write(p->chip_id, APC_MASTER_PB_CTRL, 0x10000000000000FF); + xscom_write(p->chip_id, APC_MASTER_CONFIG, 0x4070000000000000); + + /* tlb and mmio */ + xscom_write(p->chip_id, TRANSPORT_CONTROL, 0x4028000100000000); + + xscom_write(p->chip_id, CANNED_PRESP_MAP0, 0); + xscom_write(p->chip_id, CANNED_PRESP_MAP1, 0xFFFFFFFF00000000); + xscom_write(p->chip_id, CANNED_PRESP_MAP2, 0); + + /* error recovery */ + xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL, 0); + + xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP, 0x0ABCDEF000000000); + xscom_write(p->chip_id, CAPP_EPOCH_TIMER_CTRL, 0x00000000FFF8FFE0); + xscom_write(p->chip_id, FLUSH_UOP_CONFIG1, 0xB188280728000000); + xscom_write(p->chip_id, FLUSH_UOP_CONFIG2, 0xB188400F00000000); + xscom_write(p->chip_id, SNOOP_CAPI_CONFIG, 0x01F0000000000000); +} + +/* override some inits with CAPI defaults */ +static void phb3_init_capp_errors(struct phb3 *p) +{ + out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd0c80ffc0); + out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3fe08f8dc700f); + out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0xffff57fbff01ffde); + out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0xfcffe0fbff7ff0ec); +} + +static int64_t phb3_set_capi_mode(struct phb *phb, uint64_t mode, + uint64_t pe_number) +{ + struct phb3 *p = phb_to_phb3(phb); + uint64_t reg; + int i; + + if (mode != 1) + return OPAL_PARAMETER; + + /* poll cqstat */ + for (i = 0; i < 500; i++) { + xscom_read(p->chip_id, p->pe_xscom + 0xf, ®); + if (!(reg & 0xC000000000000000)) + break; + time_wait_ms(10); + } + if (reg & 0xC000000000000000) { + PHBERR(p, "Timeout waiting for pending transaction\n"); + return OPAL_HARDWARE; + } + + xscom_write(p->chip_id, p->spci_xscom + 0x3, 0x8000000000000000ull); + /* FIXME security timer bar + xscom_write(p->chip_id, p->spci_xscom + 0x4, 0x8000000000000000ull); + */ + + /* aib mode */ + xscom_read(p->chip_id, p->pci_xscom + 0xf, ®); + reg &= ~PPC_BITMASK(6,7); + reg |= PPC_BIT(8); + reg |= PPC_BITMASK(40, 41); + reg &= ~PPC_BIT(42); + xscom_write(p->chip_id, p->pci_xscom + 0xf, reg); + + /* pci hwconf0 */ + xscom_read(p->chip_id, p->pe_xscom + 0x18, ®); + reg |= PPC_BIT(14); + reg &= ~PPC_BIT(15); + xscom_write(p->chip_id, p->pe_xscom + 0x18, reg); + + /* pci hwconf1 */ + xscom_read(p->chip_id, p->pe_xscom + 0x19, ®); + reg &= ~PPC_BITMASK(17,18); + xscom_write(p->chip_id, p->pe_xscom + 0x19, reg); + + /* aib tx cmd cred */ + xscom_read(p->chip_id, p->pci_xscom + 0xd, ®); + reg &= ~PPC_BITMASK(42,46); + reg |= 
PPC_BIT(47); + xscom_write(p->chip_id, p->pci_xscom + 0xd, reg); + + xscom_write(p->chip_id, p->pci_xscom + 0xc, 0xff00000000000000ull); + + /* pci mode ctl */ + xscom_read(p->chip_id, p->pe_xscom + 0xb, ®); + reg |= PPC_BIT(25); + xscom_write(p->chip_id, p->pe_xscom + 0xb, reg); + + /* set tve no translate mode allow mmio window */ + memset(p->tve_cache, 0x0, sizeof(p->tve_cache)); + /* Allow address range 0x0002000000000000: 0x0002FFFFFFFFFFF */ + p->tve_cache[pe_number * 2] = 0x000000FFFFFF0a00ULL; + + phb3_ioda_sel(p, IODA2_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->tve_cache[i]); + + /* set m64 bar to pass mmio window */ + memset(p->m64b_cache, 0x0, sizeof(p->m64b_cache)); + p->m64b_cache[0] = PPC_BIT(0); /*enable*/ + p->m64b_cache[0] |= PPC_BIT(1); /*single pe*/ + p->m64b_cache[0] |= (p->mm0_base << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/ + p->m64b_cache[0] |= 0x3fffc000 | (pe_number & 0x1f); /*mask and lower pe*/ + + p->m64b_cache[1] = PPC_BIT(0); /*enable*/ + p->m64b_cache[1] |= PPC_BIT(1); /*single pe*/ + p->m64b_cache[1] |= (0x0002000000000000ULL << 12) | ((pe_number & 0x3e0) << 27); /*base and upper pe*/ + p->m64b_cache[1] |= 0x3f000000 | (pe_number & 0x1f); /*mask and lower pe*/ + + phb3_ioda_sel(p, IODA2_TBL_M64BT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->m64b_cache); i++) + out_be64(p->regs + PHB_IODA_DATA0, p->m64b_cache[i]); + + out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN); + out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64BIT_MSI_EN); + + phb3_init_capp_errors(p); + + phb3_init_capp_regs(p); + return OPAL_SUCCESS; +} + +static const struct phb_ops phb3_ops = { + .lock = phb3_lock, + .unlock = phb3_unlock, + .cfg_read8 = phb3_pcicfg_read8, + .cfg_read16 = phb3_pcicfg_read16, + .cfg_read32 = phb3_pcicfg_read32, + .cfg_write8 = phb3_pcicfg_write8, + .cfg_write16 = phb3_pcicfg_write16, + .cfg_write32 = phb3_pcicfg_write32, + .choose_bus = phb3_choose_bus, + .device_init = phb3_device_init, + .presence_detect = phb3_presence_detect, + .ioda_reset = phb3_ioda_reset, + .pci_reinit = phb3_pci_reinit, + .set_phb_mem_window = phb3_set_phb_mem_window, + .phb_mmio_enable = phb3_phb_mmio_enable, + .map_pe_mmio_window = phb3_map_pe_mmio_window, + .map_pe_dma_window = phb3_map_pe_dma_window, + .map_pe_dma_window_real = phb3_map_pe_dma_window_real, + .pci_msi_eoi = phb3_pci_msi_eoi, + .set_xive_pe = phb3_set_ive_pe, + .get_msi_32 = phb3_get_msi_32, + .get_msi_64 = phb3_get_msi_64, + .set_pe = phb3_set_pe, + .set_peltv = phb3_set_peltv, + .link_state = phb3_link_state, + .power_state = phb3_power_state, + .slot_power_off = phb3_slot_power_off, + .slot_power_on = phb3_slot_power_on, + .hot_reset = phb3_hot_reset, + .fundamental_reset = phb3_fundamental_reset, + .complete_reset = phb3_complete_reset, + .poll = phb3_poll, + .eeh_freeze_status = phb3_eeh_freeze_status, + .eeh_freeze_clear = phb3_eeh_freeze_clear, + .next_error = phb3_eeh_next_error, + .get_diag_data = NULL, + .get_diag_data2 = phb3_get_diag_data, + .set_capi_mode = phb3_set_capi_mode, +}; + +/* + * We should access those registers at the stage since the + * AIB isn't ready yet. 
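+ *
+ * This is why phb3_setup_aib() below goes through the ASB access
+ * path (phb3_write_reg_asb) rather than plain MMIO.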
+ */ +static void phb3_setup_aib(struct phb3 *p) +{ + /* Init_2 - AIB TX Channel Mapping Register */ + phb3_write_reg_asb(p, PHB_AIB_TX_CHAN_MAPPING, 0x0211230000000000); + + /* Init_3 - AIB RX command credit register */ + if (p->rev >= PHB3_REV_VENICE_DD20) + phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100020001); + else + phb3_write_reg_asb(p, PHB_AIB_RX_CMD_CRED, 0x0020000100010001); + + /* Init_4 - AIB rx data credit register */ + if (p->rev >= PHB3_REV_VENICE_DD20) + phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000010001); + else + phb3_write_reg_asb(p, PHB_AIB_RX_DATA_CRED, 0x0020002000000001); + + /* Init_5 - AIB rx credit init timer register */ + phb3_write_reg_asb(p, PHB_AIB_RX_CRED_INIT_TIMER, 0x0f00000000000000); + + /* Init_6 - AIB Tag Enable register */ + phb3_write_reg_asb(p, PHB_AIB_TAG_ENABLE, 0xffffffff00000000); + + /* Init_7 - TCE Tag Enable register */ + phb3_write_reg_asb(p, PHB_TCE_TAG_ENABLE, 0xffffffff00000000); +} + +static void phb3_init_ioda2(struct phb3 *p) +{ + /* Init_14 - LSI Source ID */ + out_be64(p->regs + PHB_LSI_SOURCE_ID, + SETFIELD(PHB_LSI_SRC_ID, 0ul, 0xff)); + + /* Init_15 - IVT BAR / Length + * Init_16 - RBA BAR + * - RTT BAR + * Init_17 - PELT-V BAR + */ + out_be64(p->regs + PHB_RTT_BAR, + p->tbl_rtt | PHB_RTT_BAR_ENABLE); + out_be64(p->regs + PHB_PELTV_BAR, + p->tbl_peltv | PHB_PELTV_BAR_ENABLE); + out_be64(p->regs + PHB_IVT_BAR, + p->tbl_ivt | 0x800 | PHB_IVT_BAR_ENABLE); + + /* DD2.0 or the subsequent chips don't have memory + * resident RBA. + */ + if (p->rev >= PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_RBA_BAR, 0x0ul); + else + out_be64(p->regs + PHB_RBA_BAR, + p->tbl_rba | PHB_RBA_BAR_ENABLE); + + /* Init_18..21 - Setup M32 */ + out_be64(p->regs + PHB_M32_BASE_ADDR, p->mm1_base); + out_be64(p->regs + PHB_M32_BASE_MASK, ~(M32_PCI_SIZE - 1)); + out_be64(p->regs + PHB_M32_START_ADDR, M32_PCI_START); + + /* Init_22 - Setup PEST BAR */ + out_be64(p->regs + PHB_PEST_BAR, + p->tbl_pest | PHB_PEST_BAR_ENABLE); + + /* Init_23 - PCIE Outbound upper address */ + out_be64(p->regs + PHB_M64_UPPER_BITS, 0); + + /* Init_24 - Interrupt represent timers + * The register doesn't take effect on Murano DD1.0 + */ + if (p->rev >= PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_INTREP_TIMER, 0x0004000000000000); + else + out_be64(p->regs + PHB_INTREP_TIMER, 0); + + /* Init_25 - PHB3 Configuration Register. Clear TCE cache then + * configure the PHB + */ + out_be64(p->regs + PHB_PHB3_CONFIG, PHB_PHB3C_64B_TCE_EN); + out_be64(p->regs + PHB_PHB3_CONFIG, + PHB_PHB3C_M32_EN | PHB_PHB3C_32BIT_MSI_EN | + PHB_PHB3C_64BIT_MSI_EN); + + /* Init_26 - At least 512ns delay according to spec */ + time_wait_ms(1); + + /* Init_27..36 - On-chip IODA tables init */ + phb3_ioda_reset(&p->phb, false); +} + +static bool phb3_wait_dlp_reset(struct phb3 *p) +{ + unsigned int i; + uint64_t val; + + /* + * Firmware cannot access the UTL core regs or PCI config space + * until the cores are out of DL_PGRESET. + * DL_PGRESET should be polled until it is inactive with a value + * of '0'. The recommended polling frequency is once every 1ms. + * Firmware should poll at least 200 attempts before giving up. + * MMIO Stores to the link are silently dropped by the UTL core if + * the link is down. + * MMIO Loads to the link will be dropped by the UTL core and will + * eventually time-out and will return an all ones response if the + * link is down. 
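+	 *
+	 * We poll every 1ms for up to DLP_RESET_ATTEMPTS (400) iterations
+	 * below, i.e. twice the documented minimum of 200 attempts.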
+ */ +#define DLP_RESET_ATTEMPTS 400 + + PHBDBG(p, "Waiting for DLP PG reset to complete...\n"); + for (i = 0; i < DLP_RESET_ATTEMPTS; i++) { + val = in_be64(p->regs + PHB_PCIE_DLP_TRAIN_CTL); + if (!(val & PHB_PCIE_DLP_TC_DL_PGRESET)) + break; + time_wait_ms(1); + } + if (val & PHB_PCIE_DLP_TC_DL_PGRESET) { + PHBERR(p, "Timeout waiting for DLP PG reset !\n"); + return false; + } + return true; +} + +/* phb3_init_rc - Initialize the Root Complex config space + */ +static bool phb3_init_rc_cfg(struct phb3 *p) +{ + int64_t ecap, aercap; + + /* XXX Handle errors ? */ + + /* Init_45..46: + * + * Set primary bus to 0, secondary to 1 and subordinate to 0xff + */ + phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PRIMARY_BUS, 0x00ff0100); + + /* Init_47..52 + * + * IO and Memory base & limits are set to base > limit, which + * allows all inbounds. + * + * XXX This has the potential of confusing the OS which might + * think that nothing is forwarded downstream. We probably need + * to fix this to match the IO and M32 PHB windows + */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_IO_BASE, 0x0010); + phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_MEM_BASE, 0x00000010); + phb3_pcicfg_write32(&p->phb, 0, PCI_CFG_PREF_MEM_BASE, 0x00000010); + + /* Init_53..54 - Setup bridge control enable forwarding of CORR, FATAL, + * and NONFATAL errors + */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_BRCTL, PCI_CFG_BRCTL_SERR_EN); + + /* Init_55..56 + * + * PCIE Device control/status, enable error reporting, disable relaxed + * ordering, set MPS to 128 (see note), clear errors. + * + * Note: The doc recommends to set MPS to 4K. This has proved to have + * some issues as it requires specific claming of MRSS on devices and + * we've found devices in the field that misbehave when doing that. + * + * We currently leave it all to 128 bytes (minimum setting) at init + * time. The generic PCIe probing later on might apply a different + * value, or the kernel will, but we play it safe at early init + */ + if (p->ecap <= 0) { + ecap = pci_find_cap(&p->phb, 0, PCI_CFG_CAP_ID_EXP); + if (ecap < 0) { + PHBERR(p, "Can't locate PCI-E capability\n"); + return false; + } + p->ecap = ecap; + } else { + ecap = p->ecap; + } + + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVSTAT, + PCICAP_EXP_DEVSTAT_CE | + PCICAP_EXP_DEVSTAT_NFE | + PCICAP_EXP_DEVSTAT_FE | + PCICAP_EXP_DEVSTAT_UE); + + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DEVCTL, + PCICAP_EXP_DEVCTL_CE_REPORT | + PCICAP_EXP_DEVCTL_NFE_REPORT | + PCICAP_EXP_DEVCTL_FE_REPORT | + PCICAP_EXP_DEVCTL_UR_REPORT | + SETFIELD(PCICAP_EXP_DEVCTL_MPS, 0, PCIE_MPS_128B)); + + /* Init_57..58 + * + * Root Control Register. Enable error reporting + * + * Note: Added CRS visibility. + */ + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_RC, + PCICAP_EXP_RC_SYSERR_ON_CE | + PCICAP_EXP_RC_SYSERR_ON_NFE | + PCICAP_EXP_RC_SYSERR_ON_FE | + PCICAP_EXP_RC_CRS_VISIBLE); + + /* Init_59..60 + * + * Device Control 2. 
Enable ARI fwd, set timer to RTOS timer + */ + phb3_pcicfg_write16(&p->phb, 0, ecap + PCICAP_EXP_DCTL2, + SETFIELD(PCICAP_EXP_DCTL2_CMPTOUT, 0, 0xf) | + PCICAP_EXP_DCTL2_ARI_FWD); + + /* Init_61..76 + * + * AER inits + */ + aercap = pci_find_ecap(&p->phb, 0, PCIECAP_ID_AER, NULL); + if (aercap < 0) { + /* Shouldn't happen */ + PHBERR(p, "Failed to locate AER Ecapability in bridge\n"); + return false; + } + p->aercap = aercap; + + /* Clear all UE status */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_STATUS, + 0xffffffff); + /* Disable some error reporting as per the PHB3 spec */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_MASK, + PCIECAP_AER_UE_POISON_TLP | + PCIECAP_AER_UE_COMPL_TIMEOUT | + PCIECAP_AER_UE_COMPL_ABORT | + PCIECAP_AER_UE_ECRC); + /* Report some errors as fatal */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_UE_SEVERITY, + PCIECAP_AER_UE_DLP | + PCIECAP_AER_UE_SURPRISE_DOWN | + PCIECAP_AER_UE_FLOW_CTL_PROT | + PCIECAP_AER_UE_UNEXP_COMPL | + PCIECAP_AER_UE_RECV_OVFLOW | + PCIECAP_AER_UE_MALFORMED_TLP); + /* Clear all CE status */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_STATUS, + 0xffffffff); + /* Disable some error reporting as per the PHB3 spec */ + /* Note: When link down, also disable rcvr errors */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CE_MASK, + PCIECAP_AER_CE_ADV_NONFATAL | + p->has_link ? 0 : PCIECAP_AER_CE_RECVR_ERR); + /* Enable ECRC generation & checking */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_CAPCTL, + PCIECAP_AER_CAPCTL_ECRCG_EN | + PCIECAP_AER_CAPCTL_ECRCC_EN); + /* Enable reporting in root error control */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_CMD, + PCIECAP_AER_RERR_CMD_FE | + PCIECAP_AER_RERR_CMD_NFE | + PCIECAP_AER_RERR_CMD_CE); + /* Clear root error status */ + phb3_pcicfg_write32(&p->phb, 0, aercap + PCIECAP_AER_RERR_STA, + 0xffffffff); + + return true; +} + +static void phb3_init_utl(struct phb3 *p) +{ + /* Init_77..79: Clear spurrious errors and assign errors to the + * right "interrupt" signal + */ + out_be64(p->regs + UTL_SYS_BUS_AGENT_STATUS, 0xffffffffffffffff); + out_be64(p->regs + UTL_SYS_BUS_AGENT_ERR_SEVERITY, 0x5000000000000000); + out_be64(p->regs + UTL_SYS_BUS_AGENT_IRQ_EN, 0xfcc0000000000000); + + /* Init_80..81: Setup tag allocations + * + * Don't touch UTL_GBIF_READ_TAGS_ALLOC, it differs betwen PHBs + * and the default is correct + */ + out_be64(p->regs + UTL_PCIE_TAGS_ALLOC, 0x0800000000000000); + + /* Init_82: PCI Express port control */ + out_be64(p->regs + UTL_PCIE_PORT_CONTROL, 0x8588006000000000); + + /* Init_83..85: Clean & setup port errors */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffdfffffffffffff); + out_be64(p->regs + UTL_PCIE_PORT_ERROR_SEV, 0x5039000000000000); + + if (p->has_link) + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad5a800000000000); + else + out_be64(p->regs + UTL_PCIE_PORT_IRQ_EN, 0xad42800000000000); + + /* Init_86 : Cleanup RC errors */ + out_be64(p->regs + UTL_RC_STATUS, 0xffffffffffffffff); +} + +static void phb3_init_errors(struct phb3 *p) +{ + /* Init_88: LEM Error Mask : Temporarily disable error interrupts */ + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0xffffffffffffffff); + + /* Init_89..97: Disable all error interrupts until end of init */ + out_be64(p->regs + PHB_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_ERR_LEM_ENABLE, 0xffffffffffffffff); + out_be64(p->regs + PHB_ERR_FREEZE_ENABLE, 0x0000000080800000); + 
out_be64(p->regs + PHB_ERR_AIB_FENCE_ENABLE, 0xffffffdd0c00ffc0); + out_be64(p->regs + PHB_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_ERR_STATUS_MASK, 0x0000000000000000); + out_be64(p->regs + PHB_ERR1_STATUS_MASK, 0x0000000000000000); + + /* Init_98_106: Configure MMIO error traps & clear old state + * + * Don't enable BAR multi-hit detection in bit 41. + */ + out_be64(p->regs + PHB_OUT_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_OUT_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_OUT_ERR_LEM_ENABLE, 0xfdffffffffbfffff); + out_be64(p->regs + PHB_OUT_ERR_FREEZE_ENABLE, 0x0000420800000000); + out_be64(p->regs + PHB_OUT_ERR_AIB_FENCE_ENABLE, 0x9cf3bc00f89c700f); + out_be64(p->regs + PHB_OUT_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_OUT_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_OUT_ERR_STATUS_MASK, 0x0000000000400000); + out_be64(p->regs + PHB_OUT_ERR1_STATUS_MASK, 0x0000000000400000); + + /* Init_107_115: Configure DMA_A error traps & clear old state */ + out_be64(p->regs + PHB_INA_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_INA_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR_LEM_ENABLE, 0xffffffffffffffff); + out_be64(p->regs + PHB_INA_ERR_FREEZE_ENABLE, 0xc00003a901006000); + out_be64(p->regs + PHB_INA_ERR_AIB_FENCE_ENABLE, 0x3fff5452fe019fde); + out_be64(p->regs + PHB_INA_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR_STATUS_MASK, 0x0000000000000000); + out_be64(p->regs + PHB_INA_ERR1_STATUS_MASK, 0x0000000000000000); + + /* Init_116_124: Configure DMA_B error traps & clear old state */ + out_be64(p->regs + PHB_INB_ERR_STATUS, 0xffffffffffffffff); + out_be64(p->regs + PHB_INB_ERR1_STATUS, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR_LEM_ENABLE, 0xffffffffffffffff); + + /* + * Workaround for errata HW257476, turn correctable messages into + * ER freezes on Murano and Venice DD1.0 + */ + if (p->rev < PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE, + 0x0000600000000070); + else + out_be64(p->regs + PHB_INB_ERR_FREEZE_ENABLE, + 0x0000600000000060); + + out_be64(p->regs + PHB_INB_ERR_AIB_FENCE_ENABLE, 0xfcff80fbff7ff08c); + out_be64(p->regs + PHB_INB_ERR_LOG_0, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR_LOG_1, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR_STATUS_MASK, 0x0000000000000000); + out_be64(p->regs + PHB_INB_ERR1_STATUS_MASK, 0x0000000000000000); + + /* Init_125..128: Cleanup & configure LEM */ + out_be64(p->regs + PHB_LEM_FIR_ACCUM, 0x0000000000000000); + out_be64(p->regs + PHB_LEM_ACTION0, 0xffffffffffffffff); + out_be64(p->regs + PHB_LEM_ACTION1, 0xffffffffffffffff); + out_be64(p->regs + PHB_LEM_WOF, 0x0000000000000000); +} + +static void phb3_init_hw(struct phb3 *p) +{ + uint64_t val; + + PHBDBG(p, "Initializing PHB...\n"); + + /* Lift reset */ + xscom_read(p->chip_id, p->spci_xscom + 1, &val);/* HW275117 */ + xscom_write(p->chip_id, p->pci_xscom + 0xa, 0); + time_wait_ms(100); + + /* Grab version and fit it in an int */ + val = phb3_read_reg_asb(p, PHB_VERSION); + if (val == 0 || val == 0xffffffffffffffff) { + PHBERR(p, "Failed to read version, PHB appears broken\n"); + goto failed; + } + + p->rev = ((val >> 16) & 0x00ff0000) | (val & 0xffff); + PHBDBG(p, "Core revision 0x%x\n", p->rev); + + /* Setup AIB credits etc... 
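+	 *
+	 * (Init_2..7, done in phb3_setup_aib() via the ASB path since the
+	 * AIB side isn't usable yet.)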
*/ + phb3_setup_aib(p); + + /* Init_8 - PCIE System Configuration Register + * + * Not changed from default values. Beware that bits [04:09] should + * be different between PHBs (x16 vs x8). + */ + PHBDBG(p, "Default system config: 0x%016llx\n", + in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG)); + if (p->index == 2) + val = 0x421000fc00000000; + else + val = 0x441000fc00000000; + val |= (uint64_t)p->max_link_speed << PPC_BITLSHIFT(35); + out_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG, val); + + PHBDBG(p, "New system config : 0x%016llx\n", + in_be64(p->regs + PHB_PCIE_SYSTEM_CONFIG)); + + /* Init_9..12 - PCIE DLP Lane EQ control */ + if (p->lane_eq) { + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL0, + be64_to_cpu(p->lane_eq[0])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL1, + be64_to_cpu(p->lane_eq[1])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL2, + be64_to_cpu(p->lane_eq[2])); + out_be64(p->regs + PHB_PCIE_LANE_EQ_CNTL3, + be64_to_cpu(p->lane_eq[3])); + } + + /* Init_XX - (PHB2 errata) + * + * Set proper credits, needs adjustment due to wrong defaults + * on PHB2 before we lift the reset. + */ + if (p->index == 2) + out_be64(p->regs + PHB_PCIE_SYS_LINK_INIT, 0x9008133332120000); + + /* Init_13 - PCIE Reset */ + /* + * Lift the PHB resets but not PERST, this will be lifted + * later by the initial PERST state machine + */ + PHBDBG(p, "PHB_RESET is 0x%016llx\n", in_be64(p->regs + PHB_RESET)); + out_be64(p->regs + PHB_RESET, 0xd000000000000000); + + /* Architected IODA2 inits */ + phb3_init_ioda2(p); + + /* Init_37..42 - Clear UTL & DLP error logs */ + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG1, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG2, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG3, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_UTL_ERRLOG4, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_DLP_ERRLOG1, 0xffffffffffffffff); + out_be64(p->regs + PHB_PCIE_DLP_ERRLOG2, 0xffffffffffffffff); + + /* Init_43 - Wait for UTL core to come out of reset */ + if (!phb3_wait_dlp_reset(p)) + goto failed; + + /* Init_44 - Clear port status */ + out_be64(p->regs + UTL_PCIE_PORT_STATUS, 0xffffffffffffffff); + + /* Init_45..76: Init root complex config space */ + if (!phb3_init_rc_cfg(p)) + goto failed; + + /* Init_77..86 : Init UTL */ + phb3_init_utl(p); + + /* + * Init_87: PHB Control register. Various PHB settings + * Enable IVC for Murano DD2.0 or later one + */ +#ifdef IVT_TABLE_IVE_16B + val = 0xf3a80e4b00000000; +#else + val = 0xf3a80ecb00000000; +#endif + if (p->rev >= PHB3_REV_MURANO_DD20) + val |= 0x0000010000000000; + out_be64(p->regs + PHB_CONTROL, val); + + /* Init_88..128 : Setup error registers */ + phb3_init_errors(p); + + /* Init_129: Read error summary */ + val = in_be64(p->regs + PHB_ETU_ERR_SUMMARY); + if (val) { + PHBERR(p, "Errors detected during PHB init: 0x%16llx\n", val); + goto failed; + } + + /* NOTE: At this point the spec waits for the link to come up. We + * don't bother as we are doing a PERST soon. + */ + + /* XXX I don't know why the spec does this now and not earlier, so + * to be sure to get it right we might want to move it to the freset + * state machine, though the generic PCI layer will probably do + * this anyway (ie, enable MEM, etc... in the RC) + * + * Note:The spec enables IO but PHB3 doesn't do IO space .... so we + * leave that clear. 
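+	 *
+	 * The command register write below therefore only turns on memory
+	 * decode, bus mastering and PERR/SERR reporting.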
+ */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_CMD, + PCI_CFG_CMD_MEM_EN | + PCI_CFG_CMD_BUS_MASTER_EN | + PCI_CFG_CMD_PERR_RESP | + PCI_CFG_CMD_SERR_EN); + + /* Clear errors */ + phb3_pcicfg_write16(&p->phb, 0, PCI_CFG_STAT, + PCI_CFG_STAT_SENT_TABORT | + PCI_CFG_STAT_RECV_TABORT | + PCI_CFG_STAT_RECV_MABORT | + PCI_CFG_STAT_SENT_SERR | + PCI_CFG_STAT_RECV_PERR); + + /* Init_136 - Re-enable error interrupts */ + + /* TBD: Should we mask any of these for PERST ? */ + out_be64(p->regs + PHB_ERR_IRQ_ENABLE, 0x0000002280b80000); + out_be64(p->regs + PHB_OUT_ERR_IRQ_ENABLE, 0x600c42fc042080f0); + out_be64(p->regs + PHB_INA_ERR_IRQ_ENABLE, 0xc000a3a901826020); + out_be64(p->regs + PHB_INB_ERR_IRQ_ENABLE, 0x0000600000800070); + out_be64(p->regs + PHB_LEM_ERROR_MASK, 0x42498e327f502eae); + + /* + * Init_141 - Enable DMA address speculation + * + * Errata#20131017: Disable speculation until Murano DD2.0 + * + * Note: We keep IVT speculation disabled (bit 4). It should work with + * Murano DD2.0 and later but lacks sufficient testing. We will re-enable + * it once that has been done. + */ + if (p->rev >= PHB3_REV_MURANO_DD20) + out_be64(p->regs + PHB_TCE_SPEC_CTL, 0xf000000000000000); + else + out_be64(p->regs + PHB_TCE_SPEC_CTL, 0x0ul); + + /* Errata#20131017: avoid TCE queue overflow */ + if (p->rev == PHB3_REV_MURANO_DD20) + phb3_write_reg_asb(p, PHB_TCE_WATERMARK, 0x0003000000030302); + + /* Init_142 - PHB3 - Timeout Control Register 1 */ + out_be64(p->regs + PHB_TIMEOUT_CTRL1, 0x1713132016200000); + + /* Init_143 - PHB3 - Timeout Control Register 2 */ + out_be64(p->regs + PHB_TIMEOUT_CTRL2, 0x2320d71600000000); + + /* Mark the PHB as functional which enables all the various sequences */ + p->state = PHB3_STATE_FUNCTIONAL; + + PHBDBG(p, "Initialization complete\n"); + + return; + + failed: + PHBERR(p, "Initialization failed\n"); + p->state = PHB3_STATE_BROKEN; +} + +static void phb3_allocate_tables(struct phb3 *p) +{ + /* XXX Our current memalign implementation sucks, + * + * It will do the job, however it doesn't support freeing + * the memory and wastes space by always allocating twice + * as much as requested (size + alignment) + */ + p->tbl_rtt = (uint64_t)local_alloc(p->chip_id, RTT_TABLE_SIZE, RTT_TABLE_SIZE); + assert(p->tbl_rtt); + memset((void *)p->tbl_rtt, 0, RTT_TABLE_SIZE); + + p->tbl_peltv = (uint64_t)local_alloc(p->chip_id, PELTV_TABLE_SIZE, PELTV_TABLE_SIZE); + assert(p->tbl_peltv); + memset((void *)p->tbl_peltv, 0, PELTV_TABLE_SIZE); + + p->tbl_pest = (uint64_t)local_alloc(p->chip_id, PEST_TABLE_SIZE, PEST_TABLE_SIZE); + assert(p->tbl_pest); + memset((void *)p->tbl_pest, 0, PEST_TABLE_SIZE); + + p->tbl_ivt = (uint64_t)local_alloc(p->chip_id, IVT_TABLE_SIZE, IVT_TABLE_SIZE); + assert(p->tbl_ivt); + memset((void *)p->tbl_ivt, 0, IVT_TABLE_SIZE); + + p->tbl_rba = (uint64_t)local_alloc(p->chip_id, RBA_TABLE_SIZE, RBA_TABLE_SIZE); + assert(p->tbl_rba); + memset((void *)p->tbl_rba, 0, RBA_TABLE_SIZE); +} + +static void phb3_add_properties(struct phb3 *p) +{ + struct dt_node *np = p->phb.dt_node; + uint32_t lsibase, icsp = get_ics_phandle(); + uint64_t m32b, m64b, m64s, reg, tkill; + + reg = cleanup_addr((uint64_t)p->regs); + + /* Add various properties that HB doesn't have to + * add, some of them simply because they result from + * policy decisions made in skiboot rather than in HB + * such as the MMIO windows going to PCI, interrupts, + * etc... 
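+	 *
+	 * (All of these end up in the "pciex" node created for this
+	 * bridge by phb3_probe_pbcq().)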
+ */ + dt_add_property_cells(np, "#address-cells", 3); + dt_add_property_cells(np, "#size-cells", 2); + dt_add_property_cells(np, "#interrupt-cells", 1); + dt_add_property_cells(np, "bus-range", 0, 0xff); + dt_add_property_cells(np, "clock-frequency", 0x200, 0); /* ??? */ + + dt_add_property_cells(np, "interrupt-parent", icsp); + + /* XXX FIXME: add slot-name */ + //dt_property_cell("bus-width", 8); /* Figure it out from VPD ? */ + + /* "ranges", we only expose M32 (PHB3 doesn't do IO) + * + * Note: The kernel expects us to have chopped of 64k from the + * M32 size (for the 32-bit MSIs). If we don't do that, it will + * get confused (OPAL does it) + */ + m32b = cleanup_addr(p->mm1_base); + m64b = cleanup_addr(p->mm0_base); + m64s = p->mm0_size; + dt_add_property_cells(np, "ranges", + /* M32 space */ + 0x02000000, 0x00000000, M32_PCI_START, + hi32(m32b), lo32(m32b), 0, M32_PCI_SIZE - 0x10000); + + /* XXX FIXME: add opal-memwin32, dmawins, etc... */ + dt_add_property_cells(np, "ibm,opal-m64-window", + hi32(m64b), lo32(m64b), + hi32(m64b), lo32(m64b), + hi32(m64s), lo32(m64s)); + dt_add_property(np, "ibm,opal-single-pe", NULL, 0); + //dt_add_property_cells(np, "ibm,opal-msi-ports", 2048); + dt_add_property_cells(np, "ibm,opal-num-pes", 256); + dt_add_property_cells(np, "ibm,opal-reserved-pe", 0); + dt_add_property_cells(np, "ibm,opal-msi-ranges", + p->base_msi, PHB3_MSI_IRQ_COUNT); + tkill = reg + PHB_TCE_KILL; + dt_add_property_cells(np, "ibm,opal-tce-kill", + hi32(tkill), lo32(tkill)); + + /* + * Indicate to Linux that the architected IODA2 MSI EOI method + * is supported + */ + dt_add_property_string(np, "ibm,msi-eoi-method", "ioda2"); + + /* The interrupt maps will be generated in the RC node by the + * PCI code based on the content of this structure: + */ + lsibase = p->base_lsi; + p->phb.lstate.int_size = 1; + p->phb.lstate.int_val[0][0] = lsibase + PHB3_LSI_PCIE_INTA; + p->phb.lstate.int_val[1][0] = lsibase + PHB3_LSI_PCIE_INTB; + p->phb.lstate.int_val[2][0] = lsibase + PHB3_LSI_PCIE_INTC; + p->phb.lstate.int_val[3][0] = lsibase + PHB3_LSI_PCIE_INTD; + p->phb.lstate.int_parent[0] = icsp; + p->phb.lstate.int_parent[1] = icsp; + p->phb.lstate.int_parent[2] = icsp; + p->phb.lstate.int_parent[3] = icsp; + + /* Indicators for variable tables */ + dt_add_property_cells(np, "ibm,opal-rtt-table", + hi32(p->tbl_rtt), lo32(p->tbl_rtt), RTT_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-peltv-table", + hi32(p->tbl_peltv), lo32(p->tbl_peltv), PELTV_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-pest-table", + hi32(p->tbl_pest), lo32(p->tbl_pest), PEST_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-ivt-table", + hi32(p->tbl_ivt), lo32(p->tbl_ivt), IVT_TABLE_SIZE); + dt_add_property_cells(np, "ibm,opal-ive-stride", + IVT_TABLE_STRIDE); + dt_add_property_cells(np, "ibm,opal-rba-table", + hi32(p->tbl_rba), lo32(p->tbl_rba), RBA_TABLE_SIZE); +} + +static bool phb3_calculate_windows(struct phb3 *p) +{ + const struct dt_property *prop; + + /* Get PBCQ MMIO windows from device-tree */ + prop = dt_require_property(p->phb.dt_node, + "ibm,mmio-window", -1); + assert(prop->len >= (2 * sizeof(uint64_t))); + + p->mm0_base = ((const uint64_t *)prop->prop)[0]; + p->mm0_size = ((const uint64_t *)prop->prop)[1]; + if (prop->len > 16) { + p->mm1_base = ((const uint64_t *)prop->prop)[2]; + p->mm1_size = ((const uint64_t *)prop->prop)[3]; + } + + /* Sort them so that 0 is big and 1 is small */ + if (p->mm1_size && p->mm1_size > p->mm0_size) { + uint64_t b = p->mm0_base; + uint64_t s = p->mm0_size; + 
p->mm0_base = p->mm1_base; + p->mm0_size = p->mm1_size; + p->mm1_base = b; + p->mm1_size = s; + } + + /* If 1 is too small, ditch it */ + if (p->mm1_size < M32_PCI_SIZE) + p->mm1_size = 0; + + /* If 1 doesn't exist, carve it out of 0 */ + if (p->mm1_size == 0) { + p->mm0_size /= 2; + p->mm1_base = p->mm0_base + p->mm0_size; + p->mm1_size = p->mm0_size; + } + + /* Crop mm1 to our desired size */ + if (p->mm1_size > M32_PCI_SIZE) + p->mm1_size = M32_PCI_SIZE; + + return true; +} + +static void phb3_create(struct dt_node *np) +{ + const struct dt_property *prop; + struct phb3 *p = zalloc(sizeof(struct phb3)); + size_t lane_eq_len; + struct dt_node *iplp; + char *path; + + assert(p); + + /* Populate base stuff */ + p->index = dt_prop_get_u32(np, "ibm,phb-index"); + p->chip_id = dt_prop_get_u32(np, "ibm,chip-id"); + p->regs = (void *)dt_get_address(np, 0, NULL); + p->base_msi = PHB3_MSI_IRQ_BASE(p->chip_id, p->index); + p->base_lsi = PHB3_LSI_IRQ_BASE(p->chip_id, p->index); + p->phb.dt_node = np; + p->phb.ops = &phb3_ops; + p->phb.phb_type = phb_type_pcie_v3; + p->phb.scan_map = 0x1; /* Only device 0 to scan */ + p->capp_ucode_base = 0; + p->capp_ucode_loaded = false; + if (dt_has_node_property(np, "ibm,capp-ucode", NULL)) + p->capp_ucode_base = dt_prop_get_u32(np, "ibm,capp-ucode"); + p->max_link_speed = dt_prop_get_u32_def(np, "ibm,max-link-speed", 3); + p->state = PHB3_STATE_UNINITIALIZED; + + if (!phb3_calculate_windows(p)) + return; + + /* Get the various XSCOM register bases from the device-tree */ + prop = dt_require_property(np, "ibm,xscom-bases", 3 * sizeof(uint32_t)); + p->pe_xscom = ((const uint32_t *)prop->prop)[0]; + p->spci_xscom = ((const uint32_t *)prop->prop)[1]; + p->pci_xscom = ((const uint32_t *)prop->prop)[2]; + + /* + * We skip the initial PERST assertion requested by the generic code + * when doing a cold boot because we are coming out of cold boot already + * so we save boot time that way. The PERST state machine will still + * handle waiting for the link to come up, it will just avoid actually + * asserting & deasserting the PERST output + * + * For a hot IPL, we still do a PERST + * + * Note: In absence of property (ie, FSP-less), we stick to the old + * behaviour and set skip_perst to true + */ + p->skip_perst = true; /* Default */ + + iplp = dt_find_by_path(dt_root, "ipl-params/ipl-params"); + if (iplp) { + const char *ipl_type = dt_prop_get_def(iplp, "cec-major-type", NULL); + if (ipl_type && (!strcmp(ipl_type, "hot"))) + p->skip_perst = false; + } + + /* By default link is assumed down */ + p->has_link = false; + + /* We register the PHB before we initialize it so we + * get a useful OPAL ID for it + */ + pci_register_phb(&p->phb); + + /* Hello ! 
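+	 *
+	 * (After phb3_calculate_windows() window 0 is the large M64
+	 * window and window 1 the smaller M32 one, which is what the
+	 * printout below reflects.)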
*/ + path = dt_get_path(np); + PHBINF(p, "Found %s @%p\n", path, p->regs); + PHBINF(p, " M32 [0x%016llx..0x%016llx]\n", + p->mm1_base, p->mm1_base + p->mm1_size - 1); + PHBINF(p, " M64 [0x%016llx..0x%016llx]\n", + p->mm0_base, p->mm0_base + p->mm0_size - 1); + free(path); + + /* Check if we can use the A/B detect pins */ + p->use_ab_detect = dt_has_node_property(np, "ibm,use-ab-detect", NULL); + + /* Find base location code from root node */ + p->phb.base_loc_code = dt_prop_get_def(dt_root, + "ibm,io-base-loc-code", NULL); + if (!p->phb.base_loc_code) + PHBERR(p, "Base location code not found !\n"); + + /* Check for lane equalization values from HB or HDAT */ + p->lane_eq = dt_prop_get_def_size(np, "ibm,lane-eq", NULL, &lane_eq_len); + if (p->lane_eq && lane_eq_len != (8 * 4)) { + PHBERR(p, "Device-tree has ibm,lane-eq with wrong len %ld\n", + lane_eq_len); + p->lane_eq = NULL; + } + if (p->lane_eq) { + PHBDBG(p, "Override lane equalization settings:\n"); + PHBDBG(p, " 0x%016llx 0x%016llx\n", + be64_to_cpu(p->lane_eq[0]), be64_to_cpu(p->lane_eq[1])); + PHBDBG(p, " 0x%016llx 0x%016llx\n", + be64_to_cpu(p->lane_eq[2]), be64_to_cpu(p->lane_eq[3])); + } + + /* + * Grab CEC IO VPD load info from the root of the device-tree, + * on P8 there's a single such VPD for the whole machine + */ + prop = dt_find_property(dt_root, "ibm,io-vpd"); + if (!prop) { + /* LX VPD Lid not already loaded */ + vpd_iohub_load(dt_root); + } + + /* Allocate the SkiBoot internal in-memory tables for the PHB */ + phb3_allocate_tables(p); + + phb3_add_properties(p); + + /* Clear IODA2 cache */ + phb3_init_ioda_cache(p); + + /* Register interrupt sources */ + register_irq_source(&phb3_msi_irq_ops, p, p->base_msi, + PHB3_MSI_IRQ_COUNT); + register_irq_source(&phb3_lsi_irq_ops, p, p->base_lsi, 4); + +#ifndef DISABLE_ERR_INTS + register_irq_source(&phb3_err_lsi_irq_ops, p, + p->base_lsi + PHB3_LSI_PCIE_INF, 2); +#endif + /* Get the HW up and running */ + phb3_init_hw(p); + + /* Load capp microcode into capp unit if PHB0 */ + if (p->index == 0) + capp_load_ucode(p); + + /* Platform additional setup */ + if (platform.pci_setup_phb) + platform.pci_setup_phb(&p->phb, p->index); +} + +static void phb3_probe_pbcq(struct dt_node *pbcq) +{ + uint32_t spci_xscom, pci_xscom, pe_xscom, gcid, pno; + uint64_t val, phb_bar, bar_en; + uint64_t mmio0_bar, mmio0_bmask, mmio0_sz; + uint64_t mmio1_bar, mmio1_bmask, mmio1_sz; + uint64_t reg[2]; + uint64_t mmio_win[4]; + unsigned int mmio_win_sz; + struct dt_node *np; + char *path; + uint64_t capp_ucode_base; + unsigned int max_link_speed; + + gcid = dt_get_chip_id(pbcq); + pno = dt_prop_get_u32(pbcq, "ibm,phb-index"); + path = dt_get_path(pbcq); + printf("Chip %d Found PBCQ%d at %s\n", gcid, pno, path); + free(path); + + pe_xscom = dt_get_address(pbcq, 0, NULL); + pci_xscom = dt_get_address(pbcq, 1, NULL); + spci_xscom = dt_get_address(pbcq, 2, NULL); + printf("PHB3[%d:%d]: X[PE]=0x%08x X[PCI]=0x%08x X[SPCI]=0x%08x\n", + gcid, pno, pe_xscom, pci_xscom, spci_xscom); + + /* Check if CAPP mode */ + if (xscom_read(gcid, spci_xscom + 0x03, &val)) { + prerror("PHB3[%d:%d]: Cannot read AIB CAPP ENABLE\n", + gcid, pno); + return; + } + if (val >> 63) { + prerror("PHB3[%d:%d]: Ignoring bridge in CAPP mode\n", + gcid, pno); + return; + } + + /* Get PE BARs, assume only 0 and 2 are used for now */ + xscom_read(gcid, pe_xscom + 0x42, &phb_bar); + phb_bar >>= 14; + printf("PHB3[%d:%d] REGS = 0x%016llx [4k]\n", + gcid, pno, phb_bar); + if (phb_bar == 0) { + prerror("PHB3[%d:%d]: No PHB BAR set !\n", gcid, 
pno); + return; + } + + /* Dbl check PHB BAR */ + xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */ + xscom_read(gcid, pci_xscom + 0x0b, &val); + val >>= 14; + printf("PHB3[%d:%d] PCIBAR = 0x%016llx\n", gcid, pno, val); + if (phb_bar != val) { + prerror("PHB3[%d:%d] PCIBAR invalid, fixing up...\n", + gcid, pno); + xscom_read(gcid, spci_xscom + 1, &val);/* HW275117 */ + xscom_write(gcid, pci_xscom + 0x0b, phb_bar << 14); + } + + /* Check MMIO BARs */ + xscom_read(gcid, pe_xscom + 0x40, &mmio0_bar); + xscom_read(gcid, pe_xscom + 0x43, &mmio0_bmask); + mmio0_bmask &= 0xffffffffc0000000ull; + mmio0_sz = ((~mmio0_bmask) >> 14) + 1; + mmio0_bar >>= 14; + printf("PHB3[%d:%d] MMIO0 = 0x%016llx [0x%016llx]\n", + gcid, pno, mmio0_bar, mmio0_sz); + xscom_read(gcid, pe_xscom + 0x41, &mmio1_bar); + xscom_read(gcid, pe_xscom + 0x44, &mmio1_bmask); + mmio1_bmask &= 0xffffffffc0000000ull; + mmio1_sz = ((~mmio1_bmask) >> 14) + 1; + mmio1_bar >>= 14; + printf("PHB3[%d:%d] MMIO1 = 0x%016llx [0x%016llx]\n", + gcid, pno, mmio1_bar, mmio1_sz); + + /* Check BAR enable + * + * XXX BAR aren't always enabled by HB, we'll make assumptions + * that BARs are valid if they value is non-0 + */ + xscom_read(gcid, pe_xscom + 0x45, &bar_en); + printf("PHB3[%d:%d] BAREN = 0x%016llx\n", + gcid, pno, bar_en); + + /* Always enable PHB BAR */ + bar_en |= 0x2000000000000000ull; + + /* Build MMIO windows list */ + mmio_win_sz = 0; + if (mmio0_bar) { + mmio_win[mmio_win_sz++] = mmio0_bar; + mmio_win[mmio_win_sz++] = mmio0_sz; + bar_en |= 0x8000000000000000ul; + } + if (mmio1_bar) { + mmio_win[mmio_win_sz++] = mmio1_bar; + mmio_win[mmio_win_sz++] = mmio1_sz; + bar_en |= 0x4000000000000000ul; + } + + /* No MMIO windows ? Barf ! */ + if (mmio_win_sz == 0) { + prerror("PHB3[%d:%d]: No MMIO windows enabled !\n", + gcid, pno); + return; + } + + /* Set the interrupt routing stuff, 8 relevant bits in mask + * (11 bits per PHB) + */ + val = P8_CHIP_IRQ_PHB_BASE(gcid, pno); + val = (val << 45); + xscom_write(gcid, pe_xscom + 0x1a, val); + xscom_write(gcid, pe_xscom + 0x1b, 0xff00000000000000ul); + + /* Configure LSI location to the top of the map */ + xscom_write(gcid, pe_xscom + 0x1f, 0xff00000000000000ul); + + /* Now add IRSN message bits to BAR enable and write it */ + bar_en |= 0x1800000000000000ul; + xscom_write(gcid, pe_xscom + 0x45, bar_en); + + printf("PHB3[%d:%d] NEWBAREN = 0x%016llx\n", + gcid, pno, bar_en); + + xscom_read(gcid, pe_xscom + 0x1a, &val); + printf("PHB3[%d:%d] IRSNC = 0x%016llx\n", + gcid, pno, val); + xscom_read(gcid, pe_xscom + 0x1b, &val); + printf("PHB3[%d:%d] IRSNM = 0x%016llx\n", + gcid, pno, val); + printf("PHB3[%d:%d] LSI = 0x%016llx\n", + gcid, pno, val); + + /* Create PHB node */ + reg[0] = phb_bar; + reg[1] = 0x1000; + + np = dt_new_addr(dt_root, "pciex", reg[0]); + if (!np) + return; + + dt_add_property_strings(np, "compatible", "ibm,power8-pciex", + "ibm,ioda2-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", reg, sizeof(reg)); + + /* Everything else is handled later by skiboot, we just + * stick a few hints here + */ + dt_add_property_cells(np, "ibm,xscom-bases", + pe_xscom, spci_xscom, pci_xscom); + dt_add_property(np, "ibm,mmio-window", mmio_win, 8 * mmio_win_sz); + dt_add_property_cells(np, "ibm,phb-index", pno); + dt_add_property_cells(np, "ibm,pbcq", pbcq->phandle); + dt_add_property_cells(np, "ibm,chip-id", gcid); + if (dt_has_node_property(pbcq, "ibm,use-ab-detect", NULL)) + dt_add_property(np, "ibm,use-ab-detect", NULL, 0); + if 
(dt_has_node_property(pbcq, "ibm,hub-id", NULL)) + dt_add_property_cells(np, "ibm,hub-id", + dt_prop_get_u32(pbcq, "ibm,hub-id")); + if (dt_has_node_property(pbcq, "ibm,loc-code", NULL)) { + const char *lc = dt_prop_get(pbcq, "ibm,loc-code"); + dt_add_property_string(np, "ibm,loc-code", lc); + } + if (dt_has_node_property(pbcq, "ibm,lane-eq", NULL)) { + size_t leq_size; + const void *leq = dt_prop_get_def_size(pbcq, "ibm,lane-eq", + NULL, &leq_size); + if (leq != NULL && leq_size == 4 * 8) + dt_add_property(np, "ibm,lane-eq", leq, leq_size); + } + if (dt_has_node_property(pbcq, "ibm,capp-ucode", NULL)) { + capp_ucode_base = dt_prop_get_u32(pbcq, "ibm,capp-ucode"); + dt_add_property_cells(np, "ibm,capp-ucode", capp_ucode_base); + } + max_link_speed = dt_prop_get_u32_def(pbcq, "ibm,max-link-speed", 3); + dt_add_property_cells(np, "ibm,max-link-speed", max_link_speed); + + add_chip_dev_associativity(np); +} + +void probe_phb3(void) +{ + struct dt_node *np; + + /* Look for PBCQ XSCOM nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-pbcq") + phb3_probe_pbcq(np); + + /* Look for newly created PHB nodes */ + dt_for_each_compatible(dt_root, np, "ibm,power8-pciex") + phb3_create(np); +} diff --git a/hw/psi.c b/hw/psi.c new file mode 100644 index 00000000..5cbae34e --- /dev/null +++ b/hw/psi.c @@ -0,0 +1,873 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Service Processor serial console handling code + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#define DBG(fmt...) printf(fmt) +#define DBG(fmt...) do { } while(0) +//#define FSP_TRACE + +static LIST_HEAD(psis); +static u64 psi_link_timer; +static u64 psi_link_timeout; +bool psi_link_poll_active; +static bool psi_ext_irq_policy = EXTERNAL_IRQ_POLICY_LINUX; + +static void psi_register_interrupts(struct psi *psi); +static void psi_activate_phb(struct psi *psi); + +static struct lock psi_lock = LOCK_UNLOCKED; + +void psi_set_link_polling(bool active) +{ + printf("PSI: %sing link polling\n", + active ? "start" : "stopp"); + psi_link_poll_active = active; +} + +void psi_disable_link(struct psi *psi) +{ + u64 val; + + lock(&psi_lock); + + /* + * Note: This can be called with the link already down but + * not detected as such yet by this layer since psi_check_link_active() + * operates locklessly and thus won't update the PSI structure. This + * is a non-issue, the only consequence is the messages in the log + * mentioning first the link having gone down then being disabled. 
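+ *
+ * (psi_check_link_active() below only reads PSIHB_CR and never clears
+ * psi->active itself, which is why psi->active can still be set by the
+ * time we get here.)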
+ */ + if (psi->active) { + psi->active = false; + + printf("PSI[0x%03x]: Disabling link!\n", psi->chip_id); + + /* Clear the link enable bit and disable FSP interrupts */ + val = in_be64(psi->regs + PSIHB_CR); + val &= ~PSIHB_CR_PSI_LINK_ENABLE; + val &= ~PSIHB_CR_FSP_IRQ_ENABLE; + val &= ~PSIHB_CR_FSP_IRQ; /* Clear interrupt state too */ + out_be64(psi->regs + PSIHB_CR, val); + } + + unlock(&psi_lock); +} + +bool psi_check_link_active(struct psi *psi) +{ + u64 val = in_be64(psi->regs + PSIHB_CR); + + /* + * Unlocked, used during fsp_poke_msg so we really want + * to avoid fancy link re-entrancy and deadlocks here + */ + if (!psi->active) + return false; + return (val & PSIHB_CR_PSI_LINK_ENABLE) && + (val & PSIHB_CR_FSP_LINK_ACTIVE); +} + +struct psi *psi_find_link(uint32_t chip_id) +{ + struct psi *psi; + + list_for_each(&psis, psi, list) { + if (psi->chip_id == chip_id) + return psi; + } + return NULL; +} + +#define PSI_LINK_CHECK_INTERVAL 10 /* Interval in secs */ +#define PSI_LINK_RECOVERY_TIMEOUT 900 /* 15 minutes */ + +static void psi_link_poll(void *data __unused) +{ + struct psi *psi; + u64 now; + + if (!psi_link_poll_active) + return; + + now = mftb(); + if (psi_link_timer == 0 || + (tb_compare(now, psi_link_timer) == TB_AAFTERB) || + (tb_compare(now, psi_link_timer) == TB_AEQUALB)) { + + list_for_each(&psis, psi, list) { + u64 val; + + if (psi->active || !psi->working) + continue; + + lock(&psi_lock); + if (psi->active || !psi->working) { + unlock(&psi_lock); + continue; + } + + val = in_be64(psi->regs + PSIHB_CR); + + printf("PSI[0x%03x]: Poll CR=0x%016llx\n", + psi->chip_id, val); + + if ((val & PSIHB_CR_PSI_LINK_ENABLE) && + (val & PSIHB_CR_FSP_LINK_ACTIVE)) { + printf("PSI[0x%03x]: Found active link!\n", + psi->chip_id); + psi_link_timeout = 0; + psi->active = true; + psi_activate_phb(psi); + unlock(&psi_lock); + fsp_reinit_fsp(); + return; + } + unlock(&psi_lock); + } + + if (!psi_link_timeout) + psi_link_timeout = + now + secs_to_tb(PSI_LINK_RECOVERY_TIMEOUT); + + if (tb_compare(now, psi_link_timeout) == TB_AAFTERB) { + prerror("PSI: Timed out looking for a PSI link\n"); + + /* Log error to the host from here */ + } + + /* Poll every 10 seconds */ + psi_link_timer = now + secs_to_tb(PSI_LINK_CHECK_INTERVAL); + } +} + +void psi_enable_fsp_interrupt(struct psi *psi) +{ + if (!psi->working) + return; + + /* Enable FSP interrupts in the GXHB */ + lock(&psi_lock); + out_be64(psi->regs + PSIHB_CR, + in_be64(psi->regs + PSIHB_CR) | PSIHB_CR_FSP_IRQ_ENABLE); + unlock(&psi_lock); +} + +/* Multiple bits can be set on errors */ +static void decode_psihb_error(u64 val) +{ + if (val & PSIHB_CR_PSI_ERROR) + printf("PSI: PSI Reported Error\n"); + if (val & PSIHB_CR_PSI_LINK_INACTIVE) + printf("PSI: PSI Link Inactive Transition\n"); + if (val & PSIHB_CR_FSP_ACK_TIMEOUT) + printf("PSI: FSP Ack Timeout\n"); + if (val & PSIHB_CR_MMIO_LOAD_TIMEOUT) + printf("PSI: MMIO Load Timeout\n"); + if (val & PSIHB_CR_MMIO_LENGTH_ERROR) + printf("PSI: MMIO Length Error\n"); + if (val & PSIHB_CR_MMIO_ADDRESS_ERROR) + printf("PSI: MMIO Address Error\n"); + if (val & PSIHB_CR_MMIO_TYPE_ERROR) + printf("PSI: MMIO Type Error\n"); + if (val & PSIHB_CR_UE) + printf("PSI: UE Detected\n"); + if (val & PSIHB_CR_PARITY_ERROR) + printf("PSI: Internal Parity Error\n"); + if (val & PSIHB_CR_SYNC_ERR_ALERT1) + printf("PSI: Sync Error Alert1\n"); + if (val & PSIHB_CR_SYNC_ERR_ALERT2) + printf("PSI: Sync Error Alert2\n"); + if (val & PSIHB_CR_FSP_COMMAND_ERROR) + printf("PSI: FSP Command Error\n"); +} + + +static void 
handle_psi_interrupt(struct psi *psi, u64 val) +{ + u64 reg; + + printf("PSI[0x%03x]: PSI mgmnt interrupt CR=0x%016llx\n", + psi->chip_id, val); + + if (val & (0xfffull << 20)) { + lock(&psi_lock); + psi->active = false; + + decode_psihb_error(val); + + /* Mask errors in SEMR */ + reg = in_be64(psi->regs + PSIHB_SEMR); + reg = ((0xfffull << 36) | (0xfffull << 20)); + out_be64(psi->regs + PSIHB_SEMR, reg); + printf("PSI: SEMR set to %llx\n", reg); + + /* Reset all the error bits in PSIHB_CR and + * disable FSP interrupts + */ + val = in_be64(psi->regs + PSIHB_CR); + val &= ~(0x7ffull << 20); + val &= ~PSIHB_CR_PSI_LINK_ENABLE; /* flip link enable */ + /* + * Ensure no commands/spurious interrupts reach + * the processor, by flipping the command enable. + */ + val &= ~PSIHB_CR_FSP_CMD_ENABLE; + val &= ~PSIHB_CR_FSP_IRQ_ENABLE; + val &= ~PSIHB_CR_FSP_IRQ; /* Clear interrupt state too */ + out_be64(psi->regs + PSIHB_CR, val); + printf("PSI: PSIHB_CR (error bits) set to %llx\n", + in_be64(psi->regs + PSIHB_CR)); + unlock(&psi_lock); + } else if (val & (0x1full << 11)) + printf("PSI: FSP error detected\n"); +} + +/* TODO: Determine which of these needs to be handled by powernv */ +static void handle_extra_interrupt(struct psi *psi) +{ + u64 val; + + val = in_be64(psi->regs + PSIHB_IRQ_STATUS); + + /* + * Decode interrupt type, call appropriate handlers + * when available. + */ + if (val & PSIHB_IRQ_STAT_OCC) + printf("PSI: OCC irq received\n"); + if (val & PSIHB_IRQ_STAT_FSI) + printf("PSI: FSI irq received\n"); + if (val & PSIHB_IRQ_STAT_LPC) + printf("PSI: LPC/I2C irq received\n"); + if (val & PSIHB_IRQ_STAT_LOCAL_ERR) + printf("PSI: ATTN irq received\n"); + if (val & PSIHB_IRQ_STAT_HOST_ERR) { + if (platform.external_irq) + platform.external_irq(psi->chip_id); + } + + /* + * TODO: Per Vicente Chung, CRESPs don't generate interrupts, + * and are just informational. Need to define the policy + * to handle them. + */ +} + +static void psi_spurious_fsp_irq(struct psi *psi) +{ + u64 reg, bit; + + prerror("PSI: Spurious interrupt, attempting clear\n"); + + if (proc_gen == proc_gen_p8) { + reg = PSIHB_XSCOM_P8_HBCSR_CLR; + bit = PSIHB_XSCOM_P8_HBSCR_FSP_IRQ; + } else { + reg = PSIHB_XSCOM_P7_HBCSR_CLR; + bit = PSIHB_XSCOM_P7_HBSCR_FSP_IRQ; + } + xscom_write(psi->chip_id, psi->xscom_base + reg, bit); +} + +bool psi_poll_fsp_interrupt(struct psi *psi) +{ + return !!(in_be64(psi->regs + PSIHB_CR) & PSIHB_CR_FSP_IRQ); +} + +static void psi_interrupt(void *data, uint32_t isn __unused) +{ + struct psi *psi = data; + u64 val; + + val = in_be64(psi->regs + PSIHB_CR); + + if (psi_link_poll_active) { + printf("PSI[0x%03x]: PSI interrupt CR=0x%016llx (A=%d)\n", + psi->chip_id, val, psi->active); + } + + /* Handle PSI interrupts first in case it's a link down */ + if (val & PSIHB_CR_PSI_IRQ) { + handle_psi_interrupt(psi, val); + + /* + * If the link went down, re-read PSIHB_CR as + * the FSP interrupt might have been cleared. + */ + if (!psi->active) + val = in_be64(psi->regs + PSIHB_CR); + } + + + /* + * We avoid forwarding FSP interrupts if the link isn't + * active. They should be masked anyway but it looks + * like the CR bit can remain set. + */ + if (val & PSIHB_CR_FSP_IRQ) { + /* + * We have a case a flood with FSP mailbox interrupts + * when the link is down, see if we manage to clear + * the condition + */ + if (!psi->active) + psi_spurious_fsp_irq(psi); + else + fsp_interrupt(); + } + + /* P8 additional interrupt? 
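+ * (on P8 the bridge also aggregates OCC, FSI, LPC/I2C, local error and
+ * host/external error sources; handle_extra_interrupt() above decodes
+ * them from PSIHB_IRQ_STATUS)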
*/ + if (proc_gen == proc_gen_p8) + handle_extra_interrupt(psi); + + /* Poll the console buffers on any interrupt since we don't + * get send notifications + */ + fsp_console_poll(NULL); +} + +static int64_t psi_p7_set_xive(void *data, uint32_t isn __unused, + uint16_t server, uint8_t priority) +{ + struct psi *psi = data; + uint64_t xivr; + + if (!psi->working) + return OPAL_HARDWARE; + + /* Populate the XIVR */ + xivr = (uint64_t)server << 40; + xivr |= (uint64_t)priority << 32; + xivr |= P7_IRQ_BUID(psi->interrupt) << 16; + + out_be64(psi->regs + PSIHB_XIVR, xivr); + + return OPAL_SUCCESS; +} + +static int64_t psi_p7_get_xive(void *data, uint32_t isn __unused, + uint16_t *server, uint8_t *priority) +{ + struct psi *psi = data; + uint64_t xivr; + + if (!psi->working) + return OPAL_HARDWARE; + + /* Read & decode the XIVR */ + xivr = in_be64(psi->regs + PSIHB_XIVR); + + *server = (xivr >> 40) & 0x7ff; + *priority = (xivr >> 32) & 0xff; + + return OPAL_SUCCESS; +} + +static int64_t psi_p8_set_xive(void *data, uint32_t isn, + uint16_t server, uint8_t priority) +{ + struct psi *psi = data; + uint64_t xivr_p, xivr; + + switch(isn & 7) { + case P8_IRQ_PSI_FSP: + xivr_p = PSIHB_XIVR_FSP; + break; + case P8_IRQ_PSI_OCC: + xivr_p = PSIHB_XIVR_OCC; + break; + case P8_IRQ_PSI_FSI: + xivr_p = PSIHB_XIVR_FSI; + break; + case P8_IRQ_PSI_LPC: + xivr_p = PSIHB_XIVR_LPC; + break; + case P8_IRQ_PSI_LOCAL_ERR: + xivr_p = PSIHB_XIVR_LOCAL_ERR; + break; + case P8_IRQ_PSI_HOST_ERR: + xivr_p = PSIHB_XIVR_HOST_ERR; + break; + default: + return OPAL_PARAMETER; + } + + /* Populate the XIVR */ + xivr = (uint64_t)server << 40; + xivr |= (uint64_t)priority << 32; + xivr |= (uint64_t)(isn & 7) << 29; + + out_be64(psi->regs + xivr_p, xivr); + + return OPAL_SUCCESS; +} + +static int64_t psi_p8_get_xive(void *data, uint32_t isn __unused, + uint16_t *server, uint8_t *priority) +{ + struct psi *psi = data; + uint64_t xivr_p, xivr; + + switch(isn & 7) { + case P8_IRQ_PSI_FSP: + xivr_p = PSIHB_XIVR_FSP; + break; + case P8_IRQ_PSI_OCC: + xivr_p = PSIHB_XIVR_OCC; + break; + case P8_IRQ_PSI_FSI: + xivr_p = PSIHB_XIVR_FSI; + break; + case P8_IRQ_PSI_LPC: + xivr_p = PSIHB_XIVR_LPC; + break; + case P8_IRQ_PSI_LOCAL_ERR: + xivr_p = PSIHB_XIVR_LOCAL_ERR; + break; + case P8_IRQ_PSI_HOST_ERR: + xivr_p = PSIHB_XIVR_HOST_ERR; + break; + default: + return OPAL_PARAMETER; + } + + /* Read & decode the XIVR */ + xivr = in_be64(psi->regs + xivr_p); + + *server = (xivr >> 40) & 0xffff; + *priority = (xivr >> 32) & 0xff; + + return OPAL_SUCCESS; +} + +/* Called on a fast reset, make sure we aren't stuck with + * an accepted and never EOId PSI interrupt + */ +void psi_irq_reset(void) +{ + struct psi *psi; + uint64_t xivr; + + printf("PSI: Hot reset!\n"); + + assert(proc_gen == proc_gen_p7); + + list_for_each(&psis, psi, list) { + /* Mask the interrupt & clean the XIVR */ + xivr = 0x000000ff00000000; + xivr |= P7_IRQ_BUID(psi->interrupt) << 16; + out_be64(psi->regs + PSIHB_XIVR, xivr); + +#if 0 /* Seems to checkstop ... */ + /* + * Maybe not anymore; we were just blindly sending + * this on all iopaths, not just the active one; + * We don't even know if those psis are even correct. 
+ */ + /* Send a dummy EOI to make sure the ICP is clear */ + icp_send_eoi(psi->interrupt); +#endif + } +} + +static const struct irq_source_ops psi_p7_irq_ops = { + .get_xive = psi_p7_get_xive, + .set_xive = psi_p7_set_xive, + .interrupt = psi_interrupt, +}; + +static const struct irq_source_ops psi_p8_irq_ops = { + .get_xive = psi_p8_get_xive, + .set_xive = psi_p8_set_xive, + .interrupt = psi_interrupt, +}; + +static const struct irq_source_ops psi_p8_host_err_ops = { + .get_xive = psi_p8_get_xive, + .set_xive = psi_p8_set_xive, +}; + +static void psi_tce_enable(struct psi *psi, bool enable) +{ + void *addr; + u64 val; + + switch (proc_gen) { + case proc_gen_p7: + addr = psi->regs + PSIHB_CR; + break; + case proc_gen_p8: + addr = psi->regs + PSIHB_PHBSCR; + break; + default: + prerror("%s: Unknown CPU type\n", __func__); + return; + } + + val = in_be64(addr); + if (enable) + val |= PSIHB_CR_TCE_ENABLE; + else + val &= ~PSIHB_CR_TCE_ENABLE; + out_be64(addr, val); +} + +/* + * Configure the PSI interface for communicating with + * an FSP, such as enabling the TCEs, FSP commands, + * etc... + */ +void psi_init_for_fsp(struct psi *psi) +{ + uint64_t reg; + bool enable_tce = true; + + lock(&psi_lock); + + /* Disable and setup TCE base address */ + psi_tce_enable(psi, false); + + switch (proc_gen) { + case proc_gen_p7: + out_be64(psi->regs + PSIHB_TAR, PSI_TCE_TABLE_BASE | + PSIHB_TAR_16K_ENTRIES); + break; + case proc_gen_p8: + out_be64(psi->regs + PSIHB_TAR, PSI_TCE_TABLE_BASE | + PSIHB_TAR_256K_ENTRIES); + break; + default: + enable_tce = false; + }; + + /* Enable various other configuration register bits based + * on what pHyp does. We keep interrupts disabled until + * after the mailbox has been properly configured. We assume + * basic stuff such as PSI link enable is already there. + * + * - FSP CMD Enable + * - FSP MMIO Enable + * - TCE Enable + * - Error response enable + * + * Clear all other error bits + */ + if (!psi->active) { + prerror("PSI: psi_init_for_fsp() called on inactive link!\n"); + unlock(&psi_lock); + return; + } + + reg = in_be64(psi->regs + PSIHB_CR); + reg |= PSIHB_CR_FSP_CMD_ENABLE; + reg |= PSIHB_CR_FSP_MMIO_ENABLE; + reg |= PSIHB_CR_FSP_ERR_RSP_ENABLE; + reg &= ~0x00000000ffffffffull; + out_be64(psi->regs + PSIHB_CR, reg); + psi_tce_enable(psi, enable_tce); + + unlock(&psi_lock); +} + +void psi_set_external_irq_policy(bool policy) +{ + psi_ext_irq_policy = policy; +} + +/* + * Register interrupt sources for all working links, not just the active ones. + * This is a one time activity. 
+ */ +static void psi_register_interrupts(struct psi *psi) +{ + /* Configure the interrupt BUID and mask it */ + switch (proc_gen) { + case proc_gen_p7: + /* On P7, we get a single interrupt */ + out_be64(psi->regs + PSIHB_XIVR, + P7_IRQ_BUID(psi->interrupt) << 16 | + 0xffull << 32); + + /* Configure it in the GX controller as well */ + gx_configure_psi_buid(psi->chip_id, + P7_IRQ_BUID(psi->interrupt)); + + /* Register the IRQ source */ + register_irq_source(&psi_p7_irq_ops, + psi, psi->interrupt, 1); + break; + case proc_gen_p8: + /* On P8 we get a block of 8, set up the base/mask + * and mask all the sources for now + */ + out_be64(psi->regs + PSIHB_ISRN, + SETFIELD(PSIHB_ISRN_COMP, 0ul, psi->interrupt) | + SETFIELD(PSIHB_ISRN_MASK, 0ul, 0x7fff8ul) | + PSIHB_ISRN_DOWNSTREAM_EN | + PSIHB_ISRN_UPSTREAM_EN); + out_be64(psi->regs + PSIHB_XIVR_FSP, + (0xffull << 32) | (P8_IRQ_PSI_FSP << 29)); + out_be64(psi->regs + PSIHB_XIVR_OCC, + (0xffull << 32) | (P8_IRQ_PSI_OCC << 29)); + out_be64(psi->regs + PSIHB_XIVR_FSI, + (0xffull << 32) | (P8_IRQ_PSI_FSI << 29)); + out_be64(psi->regs + PSIHB_XIVR_LPC, + (0xffull << 32) | (P8_IRQ_PSI_LPC << 29)); + out_be64(psi->regs + PSIHB_XIVR_LOCAL_ERR, + (0xffull << 32) | (P8_IRQ_PSI_LOCAL_ERR << 29)); + out_be64(psi->regs + PSIHB_XIVR_HOST_ERR, + (0xffull << 32) | (P8_IRQ_PSI_HOST_ERR << 29)); + + /* + * Register the IRQ sources FSP, OCC, FSI, LPC + * and Local Error. Host Error is actually the + * external interrupt and the policy for that comes + * from the platform + */ + if (psi_ext_irq_policy == EXTERNAL_IRQ_POLICY_SKIBOOT) { + register_irq_source(&psi_p8_irq_ops, + psi, + psi->interrupt + P8_IRQ_PSI_SKIBOOT_BASE, + P8_IRQ_PSI_ALL_COUNT); + } else { + register_irq_source(&psi_p8_irq_ops, + psi, + psi->interrupt + P8_IRQ_PSI_SKIBOOT_BASE, + P8_IRQ_PSI_LOCAL_COUNT); + /* + * Host Error is handled by powernv; host error + * is at offset 5 from the PSI base. + */ + register_irq_source(&psi_p8_host_err_ops, + psi, + psi->interrupt + P8_IRQ_PSI_LINUX_BASE, + P8_IRQ_PSI_LINUX_COUNT); + } + break; + default: + /* Unknown: just no interrupts */ + prerror("PSI: Unknown interrupt type\n"); + } +} + +static void psi_activate_phb(struct psi *psi) +{ + u64 reg; + + /* + * Disable interrupt emission in the control register, + * it will be re-enabled later, after the mailbox one + * will have been enabled. + */ + reg = in_be64(psi->regs + PSIHB_CR); + reg &= ~PSIHB_CR_FSP_IRQ_ENABLE; + out_be64(psi->regs + PSIHB_CR, reg); + + /* Enable interrupts in the mask register. We enable everything + * except for bit "FSP command error detected" which the doc + * (P7 BookIV) says should be masked for normal ops. It also + * seems to be masked under OPAL. 
+ */ + reg = 0x0000010000100000ull; + out_be64(psi->regs + PSIHB_SEMR, reg); + +#if 0 + /* Dump the GXHB registers */ + printf(" PSIHB_BBAR : %llx\n", + in_be64(psi->regs + PSIHB_BBAR)); + printf(" PSIHB_FSPBAR : %llx\n", + in_be64(psi->regs + PSIHB_FSPBAR)); + printf(" PSIHB_FSPMMR : %llx\n", + in_be64(psi->regs + PSIHB_FSPMMR)); + printf(" PSIHB_TAR : %llx\n", + in_be64(psi->regs + PSIHB_TAR)); + printf(" PSIHB_CR : %llx\n", + in_be64(psi->regs + PSIHB_CR)); + printf(" PSIHB_SEMR : %llx\n", + in_be64(psi->regs + PSIHB_SEMR)); + printf(" PSIHB_XIVR : %llx\n", + in_be64(psi->regs + PSIHB_XIVR)); +#endif +} + +static void psi_create_mm_dtnode(struct psi *psi) +{ + struct dt_node *np; + uint64_t addr = (uint64_t)psi->regs; + + np = dt_new_addr(dt_root, "psi", addr); + if (!np) + return; + + /* Hard wire size to 4G */ + dt_add_property_cells(np, "reg", hi32(addr), lo32(addr), 1, 0); + switch (proc_gen) { + case proc_gen_p7: + dt_add_property_strings(np, "compatible", "ibm,psi", + "ibm,power7-psi"); + break; + case proc_gen_p8: + dt_add_property_strings(np, "compatible", "ibm,psi", + "ibm,power8-psi"); + break; + default: + dt_add_property_strings(np, "compatible", "ibm,psi"); + } + dt_add_property_cells(np, "interrupt-parent", get_ics_phandle()); + dt_add_property_cells(np, "interrupts", psi->interrupt); + dt_add_property_cells(np, "ibm,chip-id", psi->chip_id); +} + +static struct psi *alloc_psi(uint64_t base) +{ + struct psi *psi; + + psi = zalloc(sizeof(struct psi)); + if (!psi) { + prerror("PSI: Could not allocate memory\n"); + return NULL; + } + psi->xscom_base = base; + return psi; +} + +static struct psi *psi_probe_p7(struct proc_chip *chip, u64 base) +{ + struct psi *psi = NULL; + uint64_t rc, val; + + rc = xscom_read(chip->id, base + PSIHB_XSCOM_P7_HBBAR, &val); + if (rc) { + prerror("PSI: Error %llx reading PSIHB BAR on chip %d\n", + rc, chip->id); + return NULL; + } + if (val & PSIHB_XSCOM_P7_HBBAR_EN) { + psi = alloc_psi(base); + if (!psi) + return NULL; + psi->working = true; + rc = val >> 36; /* Bits 0:1 = 0x00; 2:27 Bridge BAR... */ + rc <<= 20; /* ... 
corresponds to bits 18:43 of base addr */ + psi->regs = (void *)rc; + } else + printf("PSI[0x%03x]: Working link not found\n", chip->id); + + return psi; +} + +static struct psi *psi_probe_p8(struct proc_chip *chip, u64 base) +{ + struct psi *psi = NULL; + uint64_t rc, val; + + rc = xscom_read(chip->id, base + PSIHB_XSCOM_P8_BASE, &val); + if (rc) { + prerror("PSI[0x%03x]: Error %llx reading PSIHB BAR\n", + chip->id, rc); + return NULL; + } + if (val & PSIHB_XSCOM_P8_HBBAR_EN) { + psi = alloc_psi(base); + if (!psi) + return NULL; + psi->working = true; + psi->regs = (void *)(val & ~PSIHB_XSCOM_P8_HBBAR_EN); + } else + printf("PSI[0x%03x]: Working link not found\n", chip->id); + + return psi; +} + +static bool psi_init_psihb(struct dt_node *psihb) +{ + uint32_t chip_id = dt_get_chip_id(psihb); + struct proc_chip *chip = get_chip(chip_id); + struct psi *psi = NULL; + u64 base, val; + + if (!chip) { + prerror("PSI: Can't find chip!\n"); + return false; + } + + base = dt_get_address(psihb, 0, NULL); + + if (dt_node_is_compatible(psihb, "ibm,power7-psihb-x")) + psi = psi_probe_p7(chip, base); + else if (dt_node_is_compatible(psihb, "ibm,power8-psihb-x")) + psi = psi_probe_p8(chip, base); + else { + prerror("PSI: Unknown processor type\n"); + return false; + } + if (!psi) + return false; + + list_add(&psis, &psi->list); + + val = in_be64(psi->regs + PSIHB_CR); + if (val & PSIHB_CR_FSP_LINK_ACTIVE) { + lock(&psi_lock); + psi->active = true; + unlock(&psi_lock); + } + + psi->chip_id = chip->id; + psi->interrupt = get_psi_interrupt(chip->id); + + psi_create_mm_dtnode(psi); + psi_register_interrupts(psi); + psi_activate_phb(psi); + + printf("PSI[0x%03x]: Found PSI bridge [working=%d, active=%d]\n", + psi->chip_id, psi->working, psi->active); + return true; +} + +void psi_fsp_link_in_use(struct psi *psi __unused) +{ + static bool poller_created = false; + + /* Do this once only */ + if (!poller_created) { + poller_created = true; + opal_add_poller(psi_link_poll, NULL); + } +} + +void psi_init(void) +{ + struct dt_node *np; + + dt_for_each_compatible(dt_root, np, "ibm,psihb-x") + psi_init_psihb(np); +} diff --git a/hw/sfc-ctrl.c b/hw/sfc-ctrl.c new file mode 100644 index 00000000..de163c57 --- /dev/null +++ b/hw/sfc-ctrl.c @@ -0,0 +1,523 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* Offset of SFC registers in FW space */ +#define SFC_CMDREG_OFFSET 0x00000c00 +/* Offset of SFC command buffer in FW space */ +#define SFC_CMDBUF_OFFSET 0x00000d00 +/* Offset of flash MMIO mapping in FW space */ +#define SFC_MMIO_OFFSET 0x0c000000 + + +/* + * Register definitions + */ +#define SFC_REG_CONF 0x10 /* CONF: Direct Access Configuration */ +#define SFC_REG_CONF_FRZE (1 << 3) +#define SFC_REG_CONF_ECCEN (1 << 2) +#define SFC_REG_CONF_DRCD (1 << 1) +#define SFC_REG_CONF_FLRLD (1 << 0) + +#define SFC_REG_STATUS 0x0C /* STATUS : Status Reg */ +#define SFC_REG_STATUS_NX_ON_SHFT 28 +#define SFC_REG_STATUS_RWP (1 << 27) +#define SFC_REG_STATUS_FOURBYTEAD (1 << 26) +#define SFC_REG_STATUS_ILLEGAL (1 << 4) +#define SFC_REG_STATUS_ECCERRCNTN (1 << 3) +#define SFC_REG_STATUS_ECCUEN (1 << 2) +#define SFC_REG_STATUS_DONE (1 << 0) + +#define SFC_REG_CMD 0x40 /* CMD : Command */ +#define SFC_REG_CMD_OPCODE_SHFT 9 +#define SFC_REG_CMD_LENGTH_SHFT 0 + +#define SFC_REG_SPICLK 0x3C /* SPICLK: SPI clock rate config */ +#define SFC_REG_SPICLK_OUTDLY_SHFT 24 +#define SFC_REG_SPICLK_INSAMPDLY_SHFT 16 +#define SFC_REG_SPICLK_CLKHI_SHFT 8 +#define SFC_REG_SPICLK_CLKLO_SHFT 0 + +#define SFC_REG_ADR 0x44 /* ADR : Address */ +#define SFC_REG_ERASMS 0x48 /* ERASMS : Small Erase Block Size */ +#define SFC_REG_ERASLGS 0x4C /* ERALGS : Large Erase Block Size */ +#define SFC_REG_CONF4 0x54 /* CONF4 : SPI Op Code for Small Erase */ +#define SFC_REG_CONF5 0x58 /* CONF5 : Small Erase Size config reg */ + +#define SFC_REG_CONF8 0x64 /* CONF8 : Read Command */ +#define SFC_REG_CONF8_CSINACTIVERD_SHFT 18 +#define SFC_REG_CONF8_DUMMY_SHFT 8 +#define SFC_REG_CONF8_READOP_SHFT 0 + +#define SFC_REG_ADRCBF 0x80 /* ADRCBF : First Intf NOR Addr Offset */ +#define SFC_REG_ADRCMF 0x84 /* ADRCMF : First Intf NOR Allocation */ +#define SFC_REG_ADRCBS 0x88 /* ADRCBS : Second Intf NOR Addr Offset */ +#define SFC_REG_ADRCMS 0x8C /* ADRCMS : Second Intf NOR Allocation */ +#define SFC_REG_OADRNB 0x90 /* OADRNB : Direct Access OBP Window Base Address */ +#define SFC_REG_OADRNS 0x94 /* OADRNS : DIrect Access OPB Window Size */ + +#define SFC_REG_CHIPIDCONF 0x9C /* CHIPIDCONF : config ChipId CMD */ +#define SFC_REG_CHIPIDCONF_OPCODE_SHFT 24 +#define SFC_REG_CHIPIDCONF_READ (1 << 23) +#define SFC_REG_CHIPIDCONF_WRITE (1 << 22) +#define SFC_REG_CHIPIDCONF_USE_ADDR (1 << 21) +#define SFC_REG_CHIPIDCONF_DUMMY_SHFT 16 +#define SFC_REG_CHIPIDCONF_LEN_SHFT 0 + +/* + * SFC Opcodes + */ +#define SFC_OP_READRAW 0x03 /* Read Raw */ +#define SFC_OP_WRITERAW 0x02 /* Write Raw */ +#define SFC_OP_ERASM 0x32 /* Erase Small */ +#define SFC_OP_ERALG 0x34 /* Erase Large */ +#define SFC_OP_ENWRITPROT 0x53 /* Enable WRite Protect */ +#define SFC_OP_CHIPID 0x1F /* Get Chip ID */ +#define SFC_OP_STATUS 0x05 /* Get Status */ +#define SFC_OP_TURNOFF 0x5E /* Turn Off */ +#define SFC_OP_TURNON 0x50 /* Turn On */ +#define SFC_OP_ABORT 0x6F /* Super-Abort */ +#define SFC_OP_START4BA 0x37 /* Start 4BA */ +#define SFC_OP_END4BA 0x69 /* End 4BA */ + +/* Command buffer size */ +#define SFC_CMDBUF_SIZE 256 + +struct sfc_ctrl { + /* Erase sizes */ + uint32_t small_er_size; + uint32_t large_er_size; + + /* Current 4b mode */ + bool mode_4b; + + /* Callbacks */ + struct spi_flash_ctrl ops; +}; + +/* Command register support */ +static inline int sfc_reg_read(uint8_t reg, uint32_t *val) +{ + uint32_t tmp; + int rc; + + *val = 0xffffffff; + rc = 
lpc_fw_read32(&tmp, SFC_CMDREG_OFFSET + reg); + if (rc) + return rc; + *val = be32_to_cpu(tmp); + return 0; +} + +static inline int sfc_reg_write(uint8_t reg, uint32_t val) +{ + return lpc_fw_write32(cpu_to_be32(val), SFC_CMDREG_OFFSET + reg); +} + +static int sfc_buf_write(uint32_t len, const void *data) +{ + uint32_t tmp, off = 0; + int rc; + + if (len > SFC_CMDBUF_SIZE) + return FLASH_ERR_PARM_ERROR; + + while (len >= 4) { + tmp = *(const uint32_t *)data; + rc = lpc_fw_write32(tmp, SFC_CMDBUF_OFFSET + off); + if (rc) + return rc; + off += 4; + len -= 4; + data += 4; + } + if (!len) + return 0; + + /* lpc_fw_write operates on BE values so that's what we layout + * in memory with memcpy. The swap in the register on LE doesn't + * matter, the result in memory will be in the right order. + */ + tmp = -1; + memcpy(&tmp, data, len); + return lpc_fw_write32(tmp, SFC_CMDBUF_OFFSET + off); +} + +static int sfc_buf_read(uint32_t len, void *data) +{ + uint32_t tmp, off = 0; + int rc; + + if (len > SFC_CMDBUF_SIZE) + return FLASH_ERR_PARM_ERROR; + + while (len >= 4) { + rc = lpc_fw_read32(data, SFC_CMDBUF_OFFSET + off); + if (rc) + return rc; + off += 4; + len -= 4; + data += 4; + } + if (!len) + return 0; + + rc = lpc_fw_read32(&tmp, SFC_CMDBUF_OFFSET + off); + if (rc) + return rc; + /* We know tmp contains a big endian value, so memcpy is + * our friend here + */ + memcpy(data, &tmp, len); + return 0; +} + +/* Polls until SFC indicates command is complete */ +static int sfc_poll_complete(void) +{ + uint32_t status, timeout; + struct timespec ts; + + /* + * A full 256 bytes read/write command will take at least + * 126us. Smaller commands are faster but we use less of + * them. So let's sleep in increments of 100us + */ + ts.tv_sec = 0; + ts.tv_nsec = 100000; + + /* + * Use a 1s timeout which should be sufficient for the + * commands we use + */ + timeout = 10000; + + do { + int rc; + + rc = sfc_reg_read(SFC_REG_STATUS, &status); + if (rc) + return rc; + if (status & SFC_REG_STATUS_DONE) + break; + if (--timeout == 0) + return FLASH_ERR_CTRL_TIMEOUT; + nanosleep(&ts, NULL); + } while (true); + + return 0; +} + +static int sfc_exec_command(uint8_t opcode, uint32_t length) +{ + int rc = 0; + uint32_t cmd_reg = 0; + + if (opcode > 0x7f || length > 0x1ff) + return FLASH_ERR_PARM_ERROR; + + /* Write command register to start execution */ + cmd_reg |= (opcode << SFC_REG_CMD_OPCODE_SHFT); + cmd_reg |= (length << SFC_REG_CMD_LENGTH_SHFT); + rc = sfc_reg_write(SFC_REG_CMD, cmd_reg); + if (rc) + return rc; + + /* Wait for command to complete */ + return sfc_poll_complete(); +} + +static int sfc_chip_id(struct spi_flash_ctrl *ctrl, uint8_t *id_buf, + uint32_t *id_size) +{ + uint32_t idconf; + int rc; + + (void)ctrl; + + if ((*id_size) < 3) + return FLASH_ERR_PARM_ERROR; + + /* + * XXX This will not work in locked down mode but we assume that + * in this case, the chip ID command is already properly programmed + * and the SFC will ignore this. However I haven't verified... 
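+ *
+ * As a worked example of the encoding below (assuming CMD_RDID is the
+ * usual JEDEC Read-ID opcode 0x9f): the opcode lands in bits 31:24,
+ * the READ flag is bit 23 and the 3-byte length sits in the low bits,
+ * giving idconf = 0x9f800003.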
+ */ + idconf = ((uint64_t)CMD_RDID) << SFC_REG_CHIPIDCONF_OPCODE_SHFT; + idconf |= SFC_REG_CHIPIDCONF_READ; + idconf |= (3ul << SFC_REG_CHIPIDCONF_LEN_SHFT); + (void)sfc_reg_write(SFC_REG_CHIPIDCONF, idconf); + + /* Perform command */ + rc = sfc_exec_command(SFC_OP_CHIPID, 0); + if (rc) + return rc; + + /* Read chip ID */ + rc = sfc_buf_read(3, id_buf); + if (rc) + return rc; + *id_size = 3; + + return 0; +} + + +static int sfc_read(struct spi_flash_ctrl *ctrl, uint32_t pos, + void *buf, uint32_t len) +{ + (void)ctrl; + + while(len) { + uint32_t chunk = len; + int rc; + + if (chunk > SFC_CMDBUF_SIZE) + chunk = SFC_CMDBUF_SIZE; + rc = sfc_reg_write(SFC_REG_ADR, pos); + if (rc) + return rc; + rc = sfc_exec_command(SFC_OP_READRAW, chunk); + if (rc) + return rc; + rc = sfc_buf_read(chunk, buf); + if (rc) + return rc; + len -= chunk; + pos += chunk; + buf += chunk; + } + return 0; +} + +static int sfc_write(struct spi_flash_ctrl *ctrl, uint32_t addr, + const void *buf, uint32_t size) +{ + uint32_t chunk; + int rc; + + (void)ctrl; + + while(size) { + /* We shall not cross a page boundary */ + chunk = 0x100 - (addr & 0xff); + if (chunk > size) + chunk = size; + + /* Write to SFC write buffer */ + rc = sfc_buf_write(chunk, buf); + if (rc) + return rc; + + /* Program address */ + rc = sfc_reg_write(SFC_REG_ADR, addr); + if (rc) + return rc; + + /* Send command */ + rc = sfc_exec_command(SFC_OP_WRITERAW, chunk); + if (rc) + return rc; + + addr += chunk; + buf += chunk; + size -= chunk; + } + return 0; +} + +static int sfc_erase(struct spi_flash_ctrl *ctrl, uint32_t addr, + uint32_t size) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + uint32_t sm_mask = ct->small_er_size - 1; + uint32_t lg_mask = ct->large_er_size - 1; + uint32_t chunk; + uint8_t cmd; + int rc; + + while(size) { + /* Choose erase size for this chunk */ + if (((addr | size) & lg_mask) == 0) { + chunk = ct->large_er_size; + cmd = SFC_OP_ERALG; + } else if (((addr | size) & sm_mask) == 0) { + chunk = ct->small_er_size; + cmd = SFC_OP_ERASM; + } else + return FLASH_ERR_ERASE_BOUNDARY; + + rc = sfc_reg_write(SFC_REG_ADR, addr); + if (rc) + return rc; + rc = sfc_exec_command(cmd, 0); + if (rc) + return rc; + addr += chunk; + size -= chunk; + } + return 0; +} + +static int sfc_setup(struct spi_flash_ctrl *ctrl, struct flash_info *info, + uint32_t *tsize) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + uint32_t er_flags; + + (void)tsize; + + /* Keep non-erase related flags */ + er_flags = ~FL_ERASE_ALL; + + /* Add supported erase sizes */ + if (ct->small_er_size == 0x1000 || ct->large_er_size == 0x1000) + er_flags |= FL_ERASE_4K; + if (ct->small_er_size == 0x8000 || ct->large_er_size == 0x8000) + er_flags |= FL_ERASE_32K; + if (ct->small_er_size == 0x10000 || ct->large_er_size == 0x10000) + er_flags |= FL_ERASE_64K; + + /* Mask the flags out */ + info->flags &= er_flags; + + return 0; +} + +static int sfc_set_4b(struct spi_flash_ctrl *ctrl, bool enable) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + int rc; + + rc = sfc_exec_command(enable ? 
SFC_OP_START4BA : SFC_OP_END4BA, 0); + if (rc) + return rc; + ct->mode_4b = enable; + return 0; +} + +static void sfc_validate_er_size(uint32_t *size) +{ + if (*size == 0) + return; + + /* We only support 4k, 32k and 64k */ + if (*size != 0x1000 && *size != 0x8000 && *size != 0x10000) { + FL_ERR("SFC: Erase size %d bytes unsupported\n", *size); + *size = 0; + } +} + +static int sfc_init(struct sfc_ctrl *ct) +{ + int rc; + uint32_t status; + + /* + * Assumptions: The controller has been fully initialized + * by an earlier FW layer setting the chip ID command, the + * erase sizes, and configuring the timings for reads and + * writes. + * + * This driver is meant to be usable if the configuration + * is in lock down. + * + * If that wasn't the case, we could configure some sane + * defaults here and tuned values in setup() after the + * chip has been identified. + */ + + /* Read erase sizes from flash */ + rc = sfc_reg_read(SFC_REG_ERASMS, &ct->small_er_size); + if (rc) + return rc; + sfc_validate_er_size(&ct->small_er_size); + rc = sfc_reg_read(SFC_REG_ERASLGS, &ct->large_er_size); + if (rc) + return rc; + sfc_validate_er_size(&ct->large_er_size); + + /* No erase sizes we can cope with ? Ouch... */ + if ((ct->small_er_size == 0 && ct->large_er_size == 0) || + (ct->large_er_size && (ct->small_er_size > ct->large_er_size))) { + FL_ERR("SFC: No supported erase sizes !\n"); + return FLASH_ERR_CTRL_CONFIG_MISMATCH; + } + + FL_INF("SFC: Suppored erase sizes:"); + if (ct->small_er_size) + FL_INF(" %dKB", ct->small_er_size >> 10); + if (ct->large_er_size) + FL_INF(" %dKB", ct->large_er_size >> 10); + FL_INF("\n"); + + /* Read current state of 4 byte addressing */ + rc = sfc_reg_read(SFC_REG_STATUS, &status); + if (rc) + return rc; + ct->mode_4b = !!(status & SFC_REG_STATUS_FOURBYTEAD); + + return 0; +} + +int sfc_open(struct spi_flash_ctrl **ctrl) +{ + struct sfc_ctrl *ct; + int rc; + + *ctrl = NULL; + ct = malloc(sizeof(*ct)); + if (!ct) { + FL_ERR("SFC: Failed to allocate\n"); + return FLASH_ERR_MALLOC_FAILED; + } + memset(ct, 0, sizeof(*ct)); + ct->ops.chip_id = sfc_chip_id; + ct->ops.setup = sfc_setup; + ct->ops.set_4b = sfc_set_4b; + ct->ops.read = sfc_read; + ct->ops.write = sfc_write; + ct->ops.erase = sfc_erase; + + rc = sfc_init(ct); + if (rc) + goto fail; + *ctrl = &ct->ops; + return 0; + fail: + free(ct); + return rc; +} + +void sfc_close(struct spi_flash_ctrl *ctrl) +{ + struct sfc_ctrl *ct = container_of(ctrl, struct sfc_ctrl, ops); + + /* Free the whole lot */ + free(ct); +} + diff --git a/hw/slw.c b/hw/slw.c new file mode 100644 index 00000000..3522458f --- /dev/null +++ b/hw/slw.c @@ -0,0 +1,875 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Handle ChipTOD chip & configure core timebases + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __HAVE_LIBPORE__ +#include +#include +#endif + +//#define DBG(fmt...) printf("SLW: " fmt) +#define DBG(fmt...) 
do { } while(0) + +#define MAX_RESET_PATCH_SIZE 64 +static uint32_t slw_saved_reset[MAX_RESET_PATCH_SIZE]; + +static bool slw_current_le = false; + +/* Assembly in head.S */ +extern void enter_rvwinkle(void); + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_PREDICTIVE_ERR_GENERAL, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_SET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_GET, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA, NULL); + +DEFINE_LOG_ENTRY(OPAL_RC_SLW_REG, OPAL_PLATFORM_ERR_EVT, OPAL_SLW, + OPAL_PLATFORM_FIRMWARE, OPAL_INFO, + OPAL_NA, NULL); + +static void slw_do_rvwinkle(void *data) +{ + struct cpu_thread *cpu = this_cpu(); + struct cpu_thread *master = data; + uint64_t lpcr = mfspr(SPR_LPCR); + struct proc_chip *chip; + + /* Setup our ICP to receive IPIs */ + icp_prep_for_rvwinkle(); + + /* Setup LPCR to wakeup on external interrupts only */ + mtspr(SPR_LPCR, ((lpcr & ~SPR_LPCR_P8_PECE) | SPR_LPCR_P8_PECE2)); + + printf("SLW: CPU PIR 0x%04x goint to rvwinkle...\n", cpu->pir); + + /* Tell that we got it */ + cpu->state = cpu_state_rvwinkle; + + enter_rvwinkle(); + + /* Ok, it's ours again */ + cpu->state = cpu_state_active; + + printf("SLW: CPU PIR 0x%04x woken up !\n", cpu->pir); + + /* Cleanup our ICP */ + reset_cpu_icp(); + + /* Resync timebase */ + chiptod_wakeup_resync(); + + /* Restore LPCR */ + mtspr(SPR_LPCR, lpcr); + + /* If we are passed a master pointer we are the designated + * waker, let's proceed. If not, return, we are finished. + */ + if (!master) + return; + + printf("SLW: CPU PIR 0x%04x waiting for master...\n", cpu->pir); + + /* Allriiiight... now wait for master to go down */ + while(master->state != cpu_state_rvwinkle) + sync(); + + /* XXX Wait one second ! (should check xscom state ? 
) */ + time_wait_ms(1000); + + for_each_chip(chip) { + struct cpu_thread *c; + uint64_t tmp; + for_each_available_core_in_chip(c, chip->id) { + xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + DBG("SLW: core %x:%x history: 0x%016llx (mid2)\n", + chip->id, pir_to_core_id(c->pir), tmp); + } + } + + printf("SLW: Waking master (PIR 0x%04x)...\n", master->pir); + + /* Now poke all the secondary threads on the master's core */ + for_each_cpu(cpu) { + if (!cpu_is_sibling(cpu, master) || (cpu == master)) + continue; + icp_kick_cpu(cpu); + + /* Wait for it to claim to be back (XXX ADD TIMEOUT) */ + while(cpu->state != cpu_state_active) + sync(); + } + + /* Now poke the master and be gone */ + icp_kick_cpu(master); +} + +static void slw_patch_reset(void) +{ + extern uint32_t rvwinkle_patch_start; + extern uint32_t rvwinkle_patch_end; + uint32_t *src, *dst, *sav; + + BUILD_ASSERT((&rvwinkle_patch_end - &rvwinkle_patch_start) <= + MAX_RESET_PATCH_SIZE); + + src = &rvwinkle_patch_start; + dst = (uint32_t *)0x100; + sav = slw_saved_reset; + while(src < &rvwinkle_patch_end) { + *(sav++) = *(dst); + *(dst++) = *(src++); + } + sync_icache(); +} + +static void slw_unpatch_reset(void) +{ + extern uint32_t rvwinkle_patch_start; + extern uint32_t rvwinkle_patch_end; + uint32_t *src, *dst, *sav; + + src = &rvwinkle_patch_start; + dst = (uint32_t *)0x100; + sav = slw_saved_reset; + while(src < &rvwinkle_patch_end) { + *(dst++) = *(sav++); + src++; + } + sync_icache(); +} + +static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* PowerManagement GP0 clear PM_DISABLE */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Failed to read PM_GP0\n"); + return false; + } + tmp = tmp & ~0x8000000000000000ULL; + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Failed to write PM_GP0\n"); + return false; + } + DBG("SLW: PMGP0 set to 0x%016llx\n", tmp); + + /* Read back for debug */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP0), &tmp); + DBG("SLW: PMGP0 read 0x%016llx\n", tmp); + + + /* Set CORE and ECO PFET Vret to select zero */ + rc = xscom_write(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CORE_PFET_VRET), 0); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Failed to write PM_CORE_PFET_VRET\n"); + return false; + } + rc = xscom_write(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CORE_ECO_VRET), 0); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Failed to write PM_CORE_ECO_VRET\n"); + return false; + } + + return true; +} + +static bool slw_set_overrides(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* + * Set ENABLE_IGNORE_RECOV_ERRORS in OHA_MODE_REG + * + * XXX FIXME: This should be only done for "forced" winkle such as + * when doing repairs or LE transition, and we should restore the + * original value when done + */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX(core, PM_OHA_MODE_REG), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to read PM_OHA_MODE_REG\n"); + return false; + } + tmp = tmp | 0x8000000000000000ULL; + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX(core, PM_OHA_MODE_REG), + tmp); + if (rc) { + 
log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_OHA_MODE_REG\n"); + return false; + } + DBG("SLW: PM_OHA_MODE_REG set to 0x%016llx\n", tmp); + + /* Read back for debug */ + rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX(core, PM_OHA_MODE_REG),&tmp); + DBG("SLW: PM_OHA_MODE_REG read 0x%016llx\n", tmp); + + /* + * Clear special wakeup bits that could hold power mgt + * + * XXX FIXME: See above + */ + rc = xscom_write(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SPECIAL_WAKEUP_FSP), + 0); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_SPECIAL_WAKEUP_FSP\n"); + return false; + } + rc = xscom_write(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SPECIAL_WAKEUP_OCC), + 0); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_SPECIAL_WAKEUP_OCC\n"); + return false; + } + rc = xscom_write(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SPECIAL_WAKEUP_PHYP), + 0); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_SPECIAL_WAKEUP_PHYP\n"); + return false; + } + + return true; +} + +static bool slw_unset_overrides(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + + /* XXX FIXME: Save and restore the overrides */ + printf("SLW: slw_unset_overrides %x:%x\n", chip->id, core); + return true; +} + +static bool slw_set_deep_mode(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* Init PM GP1 for fast mode or deep mode */ + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), + EX_PM_SETUP_GP1_DEEP_SLEEP); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_GP1\n"); + return false; + } + + /* Read back for debug */ + xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), &tmp); + DBG("SLW: PMGP1 read 0x%016llx\n", tmp); + return true; +} + +static bool slw_set_fast_mode(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* Init PM GP1 for fast mode or deep mode */ + rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), + EX_PM_SETUP_GP1_FAST_SLEEP); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_SET), + "SLW: Failed to write PM_GP1\n"); + return false; + } + + /* Read back for debug */ + xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_GP1), &tmp); + DBG("SLW: PMGP1 read 0x%016llx\n", tmp); + return true; +} + +static bool slw_get_idle_state_history(struct proc_chip *chip, struct cpu_thread *c) +{ + uint32_t core = pir_to_core_id(c->pir); + uint64_t tmp; + int rc; + + /* Cleanup history */ + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + return false; + } + + DBG("SLW: core %x:%x history: 0x%016llx (old1)\n", + chip->id, core, tmp); + + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + return false; + } + + DBG("SLW: core %x:%x history: 0x%016llx (old2)\n", + chip->id, core, tmp); + + return true; +} + +static bool slw_prepare_core(struct proc_chip *chip, struct cpu_thread *c) +{ + DBG("SLW: Prepare core %x:%x\n", + chip->id, pir_to_core_id(c->pir)); + + if(!slw_general_init(chip, c)) + return false; + 
if(!slw_set_overrides(chip, c)) + return false; + if(!slw_set_deep_mode(chip, c)) + return false; + if(!slw_get_idle_state_history(chip, c)) + return false; + + return true; +} + +static bool fastsleep_prepare_core(struct proc_chip *chip, struct cpu_thread *c) +{ + DBG("FASTSLEEP: Prepare core %x:%x\n", + chip->id, pir_to_core_id(c->pir)); + + if(!slw_general_init(chip, c)) + return false; + if(!slw_set_overrides(chip, c)) + return false; + if(!slw_set_fast_mode(chip, c)) + return false; + if(!slw_get_idle_state_history(chip, c)) + return false; + + return true; + +} + +/* Define device-tree fields */ +#define MAX_NAME_LEN 16 +struct cpu_idle_states { + char name[MAX_NAME_LEN]; + u32 latency_ns; + u32 flags; + u64 pmicr; + u64 pmicr_mask; +}; + +/* Flag definitions */ + +#define IDLE_DEC_STOP 0x00000001 /* Decrementer would stop */ +#define IDLE_TB_STOP 0x00000002 /* Timebase would stop */ +#define IDLE_LOSE_USER_CONTEXT 0x00000100 /* Restore GPRs like nap */ +#define IDLE_LOSE_HYP_CONTEXT 0x00000200 /* Restore hypervisor resource + from PACA pointer */ +#define IDLE_LOSE_FULL_CONTEXT 0x00000400 /* Restore hypervisor resource + by searching PACA */ +#define IDLE_USE_INST_NAP 0x00010000 /* Use nap instruction */ +#define IDLE_USE_INST_SLEEP 0x00020000 /* Use sleep instruction */ +#define IDLE_USE_INST_WINKLE 0x00040000 /* Use winkle instruction */ +#define IDLE_USE_PMICR 0x00800000 /* Use SPR PMICR instruction */ + +#define IDLE_FASTSLEEP_PMICR 0x0000002000000000 +#define IDLE_DEEPSLEEP_PMICR 0x0000003000000000 +#define IDLE_SLEEP_PMICR_MASK 0x0000003000000000 + +#define IDLE_FASTWINKLE_PMICR 0x0000000000200000 +#define IDLE_DEEPWINKLE_PMICR 0x0000000000300000 +#define IDLE_WINKLE_PMICR_MASK 0x0000000000300000 + +static struct cpu_idle_states power7_cpu_idle_states[] = { + { /* nap */ + .name = "nap", + .latency_ns = 1000, + .flags = 0*IDLE_DEC_STOP \ + | 0*IDLE_TB_STOP \ + | 1*IDLE_LOSE_USER_CONTEXT \ + | 0*IDLE_LOSE_HYP_CONTEXT \ + | 0*IDLE_LOSE_FULL_CONTEXT \ + | 1*IDLE_USE_INST_NAP \ + | 0*IDLE_USE_INST_SLEEP \ + | 0*IDLE_USE_INST_WINKLE \ + | 0*IDLE_USE_PMICR, + .pmicr = 0, + .pmicr_mask = 0 }, +}; + +static struct cpu_idle_states power8_cpu_idle_states[] = { + { /* nap */ + .name = "nap", + .latency_ns = 1000, + .flags = 0*IDLE_DEC_STOP \ + | 0*IDLE_TB_STOP \ + | 1*IDLE_LOSE_USER_CONTEXT \ + | 0*IDLE_LOSE_HYP_CONTEXT \ + | 0*IDLE_LOSE_FULL_CONTEXT \ + | 1*IDLE_USE_INST_NAP \ + | 0*IDLE_USE_INST_SLEEP \ + | 0*IDLE_USE_INST_WINKLE \ + | 0*IDLE_USE_PMICR, + .pmicr = 0, + .pmicr_mask = 0 }, + { /* fast sleep */ + .name = "fastsleep", + .latency_ns = 100000, + .flags = 1*IDLE_DEC_STOP \ + | 1*IDLE_TB_STOP \ + | 1*IDLE_LOSE_USER_CONTEXT \ + | 0*IDLE_LOSE_HYP_CONTEXT \ + | 0*IDLE_LOSE_FULL_CONTEXT \ + | 0*IDLE_USE_INST_NAP \ + | 1*IDLE_USE_INST_SLEEP \ + | 0*IDLE_USE_INST_WINKLE \ + | 0*IDLE_USE_PMICR, /* Not enabled until deep + states are available */ + .pmicr = IDLE_FASTSLEEP_PMICR, + .pmicr_mask = IDLE_SLEEP_PMICR_MASK }, +}; + +/* Add device tree properties to describe idle states */ +void add_cpu_idle_state_properties(void) +{ + struct dt_node *power_mgt; + struct cpu_idle_states *states; + struct proc_chip *chip; + int nr_states; + + printf("CPU idle state device tree init\n"); + + /* Create /ibm,opal/power-mgt */ + power_mgt = dt_new(opal_node, "power-mgt"); + if (!power_mgt) { + printf("creating dt node /ibm,opal/power-mgt failed\n"); + return; + } + + /* + * Chose the right state table for the chip + * + * XXX We use the first chip version, we should probably look + * for the 
smaller of all chips instead.. + */ + chip = next_chip(NULL); + assert(chip); + if (chip->type == PROC_CHIP_P8_MURANO || + chip->type == PROC_CHIP_P8_VENICE) { + const struct dt_property *p; + bool can_sleep = true; + + p = dt_find_property(dt_root, "ibm,enabled-idle-states"); + + states = power8_cpu_idle_states; + nr_states = ARRAY_SIZE(power8_cpu_idle_states); + + /* Check if hostboot say we can sleep */ + if (p && !dt_prop_find_string(p, "fastsleep")) + can_sleep = false; + + /* Clip to NAP only on Murano DD1.x */ + if (chip->type == PROC_CHIP_P8_MURANO && + chip->ec_level < 0x20) + can_sleep = false; + + if (!can_sleep) + nr_states = 1; + } else { + states = power7_cpu_idle_states; + nr_states = ARRAY_SIZE(power7_cpu_idle_states); + } + + /* + * XXX Creating variable size properties is awkward. For now we hard wire + * the 1 and 2 states cases. Long run we want to implement functions to + * "append" strings and cells to properties so we can just have a loop + * of nr_states here + */ + switch (nr_states) { + case 1: + dt_add_property_strings(power_mgt, "ibm,cpu-idle-state-names", + states[0].name); + dt_add_property_cells(power_mgt, "ibm,cpu-idle-state-latencies-ns", + states[0].latency_ns); + dt_add_property_cells(power_mgt, "ibm,cpu-idle-state-flags", + states[0].flags); + dt_add_property_u64s(power_mgt, "ibm,cpu-idle-state-pmicr", + states[0].pmicr); + dt_add_property_u64s(power_mgt, "ibm,cpu-idle-state-pmicr-mask", + states[0].pmicr_mask); + break; + case 2: + dt_add_property_strings(power_mgt, "ibm,cpu-idle-state-names", + states[0].name, + states[1].name); + dt_add_property_cells(power_mgt, "ibm,cpu-idle-state-latencies-ns", + states[0].latency_ns, + states[1].latency_ns); + dt_add_property_cells(power_mgt, "ibm,cpu-idle-state-flags", + states[0].flags, + states[1].flags); + dt_add_property_u64s(power_mgt, "ibm,cpu-idle-state-pmicr", + states[0].pmicr, + states[1].pmicr); + dt_add_property_u64s(power_mgt, "ibm,cpu-idle-state-pmicr-mask", + states[0].pmicr_mask, + states[1].pmicr_mask); + break; + default: + prerror("SLW: Unsupported number of states\n"); + } +} + +static bool slw_prepare_chip(struct proc_chip *chip) +{ + struct cpu_thread *c; + + for_each_available_core_in_chip(c, chip->id) { + if (!slw_prepare_core(chip, c)) + return false; + } + return true; +} + +static void slw_cleanup_core(struct proc_chip *chip, struct cpu_thread *c) +{ + uint64_t tmp; + int rc; + + /* Display history to check transition */ + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + /* XXX error handling ? return false; */ + } + + printf("SLW: core %x:%x history: 0x%016llx (new1)\n", + chip->id, pir_to_core_id(c->pir), tmp); + + rc = xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_GET), + "SLW: Failed to read PM_IDLE_STATE_HISTORY\n"); + /* XXX error handling ? return false; */ + } + + printf("SLW: core %x:%x history: 0x%016llx (new2)\n", + chip->id, pir_to_core_id(c->pir), tmp); + + /* + * XXX FIXME: Error out if the transition didn't reach rvwinkle ? 
+ */ + + /* + * XXX FIXME: We should restore a bunch of the EX bits we + * overwrite to sane values here + */ + slw_unset_overrides(chip, c); +} + +static void slw_cleanup_chip(struct proc_chip *chip) +{ + struct cpu_thread *c; + + for_each_available_core_in_chip(c, chip->id) + slw_cleanup_core(chip, c); +} + +#ifdef __HAVE_LIBPORE__ +static void slw_patch_scans(struct proc_chip *chip, bool le_mode) +{ + int64_t rc; + uint64_t old_val, new_val; + + rc = sbe_xip_get_scalar((void *)chip->slw_base, + "skip_ex_override_ring_scans", &old_val); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to read scan override on chip %d\n", + chip->id); + return; + } + + new_val = le_mode ? 0 : 1; + + DBG("SLW: Chip %d, LE value was: %lld, setting to %lld\n", + chip->id, old_val, new_val); + + rc = sbe_xip_set_scalar((void *)chip->slw_base, + "skip_ex_override_ring_scans", new_val); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to set LE mode on chip %d\n", chip->id); + return; + } +} +#else +static inline void slw_patch_scans(struct proc_chip *chip __unused, + bool le_mode __unused ) { } +#endif /* __HAVE_LIBPORE__ */ + +int64_t slw_reinit(uint64_t flags) +{ + struct proc_chip *chip; + struct cpu_thread *cpu; + bool has_waker = false; + bool target_le = slw_current_le; + +#ifndef __HAVE_LIBPORE__ + return OPAL_UNSUPPORTED; +#endif + + if (flags & OPAL_REINIT_CPUS_HILE_BE) + target_le = false; + if (flags & OPAL_REINIT_CPUS_HILE_LE) + target_le = true; + + DBG("SLW Reinit from CPU PIR 0x%04x, HILE set to %s endian...\n", + this_cpu()->pir, target_le ? "little" : "big"); + + /* Prepare chips/cores for rvwinkle */ + for_each_chip(chip) { + if (!chip->slw_base) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Not found on chip %d\n", chip->id); + return OPAL_HARDWARE; + } + if (!slw_prepare_chip(chip)) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Error preparing chip %d\n", chip->id); + return OPAL_HARDWARE; + } + slw_patch_scans(chip, target_le); + } + slw_current_le = target_le; + + /* XXX Save HIDs ? Or do that in head.S ... */ + + slw_patch_reset(); + + /* rvwinkle everybody and pick one to wake me once I rvwinkle myself */ + for_each_available_cpu(cpu) { + struct cpu_thread *master = NULL; + + if (cpu == this_cpu()) + continue; + + /* Pick up a waker for myself: it must not be a sibling of + * the current CPU and must be a thread 0 (so it gets to + * sync its timebase before doing time_wait_ms() + */ + if (!has_waker && !cpu_is_sibling(cpu, this_cpu()) && + cpu_is_thread0(cpu)) { + has_waker = true; + master = this_cpu(); + } + __cpu_queue_job(cpu, slw_do_rvwinkle, master, true); + + /* Wait for it to claim to be down */ + while(cpu->state != cpu_state_rvwinkle) + sync(); + } + + /* XXX Wait one second ! (should check xscom state ? 
) */ + DBG("SLW: [TB=0x%016lx] Waiting one second...\n", mftb()); + time_wait_ms(1000); + DBG("SLW: [TB=0x%016lx] Done.\n", mftb()); + + for_each_chip(chip) { + struct cpu_thread *c; + uint64_t tmp; + for_each_available_core_in_chip(c, chip->id) { + xscom_read(chip->id, + XSCOM_ADDR_P8_EX_SLAVE(pir_to_core_id(c->pir), + EX_PM_IDLE_STATE_HISTORY_PHYP), + &tmp); + printf("SLW: core %x:%x history: 0x%016llx (mid)\n", + chip->id, pir_to_core_id(c->pir), tmp); + } + } + + + /* Wake everybody except on my core */ + for_each_cpu(cpu) { + if (cpu->state != cpu_state_rvwinkle || + cpu_is_sibling(cpu, this_cpu())) + continue; + icp_kick_cpu(cpu); + + /* Wait for it to claim to be back (XXX ADD TIMEOUT) */ + while(cpu->state != cpu_state_active) + sync(); + } + + /* Did we find a waker ? If we didn't, that means we had no + * other core in the system, we can't do it + */ + if (!has_waker) { + DBG("SLW: No candidate waker, giving up !\n"); + return OPAL_HARDWARE; + } + + /* Our siblings are rvwinkling, and our waker is waiting for us + * so let's just go down now + */ + slw_do_rvwinkle(NULL); + + slw_unpatch_reset(); + + for_each_chip(chip) + slw_cleanup_chip(chip); + + DBG("SLW Reinit complete !\n"); + + return OPAL_SUCCESS; +} + +#ifdef __HAVE_LIBPORE__ +static void slw_patch_regs(struct proc_chip *chip) +{ + struct cpu_thread *c; + void *image = (void *)chip->slw_base; + int rc; + + for_each_available_cpu(c) { + if (c->chip_id != chip->id) + continue; + + /* Clear HRMOR */ + rc = p8_pore_gen_cpureg_fixed(image, P8_SLW_MODEBUILD_SRAM, + P8_SPR_HRMOR, 0, + cpu_get_core_index(c), + cpu_get_thread_index(c)); + if (rc) { + log_simple_error(&e_info(OPAL_RC_SLW_REG), + "SLW: Failed to set HRMOR for CPU %x\n", + c->pir); + } + + /* XXX Add HIDs etc... */ + } +} +#endif /* __HAVE_LIBPORE__ */ + +static void slw_init_chip(struct proc_chip *chip) +{ + int rc __unused; + struct cpu_thread *c; + + prerror("SLW: Init chip 0x%x\n", chip->id); + + if (!chip->slw_base) { + prerror("SLW: No image found !\n"); + return; + } + +#ifdef __HAVE_LIBPORE__ + /* Check actual image size */ + rc = sbe_xip_get_scalar((void *)chip->slw_base, "image_size", + &chip->slw_image_size); + if (rc != 0) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Error %d reading SLW image size\n", rc); + /* XXX Panic ? */ + chip->slw_base = 0; + chip->slw_bar_size = 0; + chip->slw_image_size = 0; + return; + } + printf("SLW: Image size from image: 0x%llx\n", chip->slw_image_size); + + if (chip->slw_image_size > chip->slw_bar_size) { + log_simple_error(&e_info(OPAL_RC_SLW_INIT), + "SLW: Built-in image size larger than BAR size !\n"); + /* XXX Panic ? */ + } + + /* Patch SLW image */ + slw_patch_regs(chip); +#endif /* __HAVE_LIBPORE__ */ + + /* At power ON setup inits for fast-sleep */ + for_each_available_core_in_chip(c, chip->id) { + fastsleep_prepare_core(chip, c); + } +} + +void slw_init(void) +{ + struct proc_chip *chip; + + if (proc_gen != proc_gen_p8) + return; + + for_each_chip(chip) + slw_init_chip(chip); +} + diff --git a/hw/xscom.c b/hw/xscom.c new file mode 100644 index 00000000..c4c3be24 --- /dev/null +++ b/hw/xscom.c @@ -0,0 +1,518 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <xscom.h>
+#include <io.h>
+#include <processor.h>
+#include <device.h>
+#include <chip.h>
+#include <centaur.h>
+#include <fsp-elog.h>
+
+/* Mask of bits to clear in HMER before an access */
+#define HMER_CLR_MASK	(~(SPR_HMER_XSCOM_FAIL | \
+			   SPR_HMER_XSCOM_DONE | \
+			   SPR_HMER_XSCOM_STATUS_MASK))
+
+#define XSCOM_ADDR_IND_FLAG		PPC_BIT(0)
+#define XSCOM_ADDR_IND_ADDR_MASK	PPC_BITMASK(12,31)
+#define XSCOM_ADDR_IND_ADDR_LSH		PPC_BITLSHIFT(31)
+#define XSCOM_ADDR_IND_DATA_MSK		PPC_BITMASK(48,63)
+
+#define XSCOM_DATA_IND_READ		PPC_BIT(0)
+#define XSCOM_DATA_IND_COMPLETE		PPC_BIT(32)
+#define XSCOM_DATA_IND_ERR_MASK		PPC_BITMASK(33,35)
+#define XSCOM_DATA_IND_ERR_LSH		PPC_BITLSHIFT(35)
+#define XSCOM_DATA_IND_DATA_MSK		PPC_BITMASK(48,63)
+
+/* HB folks say: try 10 times for now */
+#define XSCOM_IND_MAX_RETRIES		10
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+		OPAL_NA, NULL);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_INDIRECT_RW, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+		OPAL_NA, NULL);
+
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+		OPAL_NA, NULL);
+
+/*
+ * Locking notes:
+ *
+ * We used to have a per-target lock. However due to errata HW822317
+ * we can have issues on the issuer side if multiple threads try to
+ * send XSCOMs simultaneously (HMER responses get mixed up), so just
+ * use a global lock instead
+ */
+static struct lock xscom_lock = LOCK_UNLOCKED;
+
+static inline void *xscom_addr(uint32_t gcid, uint32_t pcb_addr)
+{
+	struct proc_chip *chip = get_chip(gcid);
+	uint64_t addr;
+
+	assert(chip);
+	addr = chip->xscom_base;
+	addr |= ((uint64_t)pcb_addr << 4) & ~0xfful;
+	addr |= (pcb_addr << 3) & 0x78;
+
+	return (void *)addr;
+}
+
+static uint64_t xscom_wait_done(void)
+{
+	uint64_t hmer;
+
+	do
+		hmer = mfspr(SPR_HMER);
+	while(!(hmer & SPR_HMER_XSCOM_DONE));
+
+	/*
+	 * HW822317: We need to read a second time as the actual
+	 * status can be delayed by 1 cycle after DONE
+	 */
+	return mfspr(SPR_HMER);
+}
+
+static void xscom_reset(uint32_t gcid)
+{
+	u64 hmer;
+
+	/* Clear errors in HMER */
+	mtspr(SPR_HMER, HMER_CLR_MASK);
+
+	/* First we need to write 0 to a register on our chip */
+	out_be64(xscom_addr(this_cpu()->chip_id, 0x202000f), 0);
+	hmer = xscom_wait_done();
+	if (hmer & SPR_HMER_XSCOM_FAIL)
+		goto fail;
+
+	/* Then we need to clear those two other registers on the target */
+	out_be64(xscom_addr(gcid, 0x2020007), 0);
+	hmer = xscom_wait_done();
+	if (hmer & SPR_HMER_XSCOM_FAIL)
+		goto fail;
+	out_be64(xscom_addr(gcid, 0x2020009), 0);
+	hmer = xscom_wait_done();
+	if (hmer & SPR_HMER_XSCOM_FAIL)
+		goto fail;
+	return;
+ fail:
+	/* Fatal error resetting XSCOM */
+	log_simple_error(&e_info(OPAL_RC_XSCOM_RESET),
+		"XSCOM: Fatal error resetting engine after failed access !\n");
+
+	/* XXX Generate error log ? attn ? panic ?
+ * If we decide to panic, change the above severity to PANIC + */ +} + +static bool xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, + bool is_write) +{ + unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer); + + /* XXX Figure out error codes from doc and error + * recovery procedures + */ + switch(stat) { + /* XSCOM blocked, just retry */ + case 1: + return true; + } + + /* XXX: Create error log entry ? */ + log_simple_error(&e_info(OPAL_RC_XSCOM_RW), + "XSCOM: %s error gcid=0x%x pcb_addr=0x%x stat=0x%x\n", + is_write ? "write" : "read", gcid, pcb_addr, stat); + + /* We need to reset the XSCOM or we'll hang on the next access */ + xscom_reset(gcid); + + /* Non recovered ... just fail */ + return false; +} + +static void xscom_handle_ind_error(uint64_t data, uint32_t gcid, + uint64_t pcb_addr, bool is_write) +{ + unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data); + bool timeout = !(data & XSCOM_DATA_IND_COMPLETE); + + /* XXX: Create error log entry ? */ + if (timeout) + log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW), + "XSCOM: %s indirect timeout, gcid=0x%x pcb_addr=0x%llx" + " stat=0x%x\n", + is_write ? "write" : "read", gcid, pcb_addr, stat); + else + log_simple_error(&e_info(OPAL_RC_XSCOM_INDIRECT_RW), + "XSCOM: %s indirect error, gcid=0x%x pcb_addr=0x%llx" + " stat=0x%x\n", + is_write ? "write" : "read", gcid, pcb_addr, stat); +} + +static bool xscom_gcid_ok(uint32_t gcid) +{ + return get_chip(gcid) != NULL; +} + +/* + * Low level XSCOM access functions, perform a single direct xscom + * access via MMIO + */ +static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) +{ + uint64_t hmer; + + if (!xscom_gcid_ok(gcid)) { + prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); + return OPAL_PARAMETER; + } + + for (;;) { + /* Clear status bits in HMER (HMER is special + * writing to it *ands* bits + */ + mtspr(SPR_HMER, HMER_CLR_MASK); + + /* Read value from SCOM */ + *val = in_be64(xscom_addr(gcid, pcb_addr)); + + /* Wait for done bit */ + hmer = xscom_wait_done(); + + /* Check for error */ + if (!(hmer & SPR_HMER_XSCOM_FAIL)) + break; + + /* Handle error and eventually retry */ + if (!xscom_handle_error(hmer, gcid, pcb_addr, false)) + return OPAL_HARDWARE; + } + return 0; +} + +static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) +{ + uint64_t hmer; + + if (!xscom_gcid_ok(gcid)) { + prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); + return OPAL_PARAMETER; + } + + for (;;) { + /* Clear status bits in HMER (HMER is special + * writing to it *ands* bits + */ + mtspr(SPR_HMER, HMER_CLR_MASK); + + /* Write value to SCOM */ + out_be64(xscom_addr(gcid, pcb_addr), val); + + /* Wait for done bit */ + hmer = xscom_wait_done(); + + /* Check for error */ + if (!(hmer & SPR_HMER_XSCOM_FAIL)) + break; + + /* Handle error and eventually retry */ + if (!xscom_handle_error(hmer, gcid, pcb_addr, true)) + return OPAL_HARDWARE; + } + return 0; +} + +/* + * Indirect XSCOM access functions + */ +static int xscom_indirect_read(uint32_t gcid, uint64_t pcb_addr, uint64_t *val) +{ + uint32_t addr; + uint64_t data; + int rc, retries; + + if (proc_gen != proc_gen_p8) { + *val = (uint64_t)-1; + return OPAL_UNSUPPORTED; + } + + /* Write indirect address */ + addr = pcb_addr & 0x7fffffff; + data = XSCOM_DATA_IND_READ | + (pcb_addr & XSCOM_ADDR_IND_ADDR_MASK); + rc = __xscom_write(gcid, addr, data); + if (rc) + goto bail; + + /* Wait for completion */ + for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { + rc = 
__xscom_read(gcid, addr, &data); + if (rc) + goto bail; + if ((data & XSCOM_DATA_IND_COMPLETE) && + ((data & XSCOM_DATA_IND_ERR_MASK) == 0)) { + *val = data & XSCOM_DATA_IND_DATA_MSK; + break; + } + if ((data & XSCOM_DATA_IND_COMPLETE) || + (retries >= XSCOM_IND_MAX_RETRIES)) { + xscom_handle_ind_error(data, gcid, pcb_addr, + false); + rc = OPAL_HARDWARE; + goto bail; + } + } + bail: + if (rc) + *val = (uint64_t)-1; + return rc; +} + +static int xscom_indirect_write(uint32_t gcid, uint64_t pcb_addr, uint64_t val) +{ + uint32_t addr; + uint64_t data; + int rc, retries; + + if (proc_gen != proc_gen_p8) + return OPAL_UNSUPPORTED; + + /* Write indirect address & data */ + addr = pcb_addr & 0x7fffffff; + data = pcb_addr & XSCOM_ADDR_IND_ADDR_MASK; + data |= val & XSCOM_ADDR_IND_DATA_MSK; + + rc = __xscom_write(gcid, addr, data); + if (rc) + goto bail; + + /* Wait for completion */ + for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { + rc = __xscom_read(gcid, addr, &data); + if (rc) + goto bail; + if ((data & XSCOM_DATA_IND_COMPLETE) && + ((data & XSCOM_DATA_IND_ERR_MASK) == 0)) + break; + if ((data & XSCOM_DATA_IND_COMPLETE) || + (retries >= XSCOM_IND_MAX_RETRIES)) { + xscom_handle_ind_error(data, gcid, pcb_addr, + false); + rc = OPAL_HARDWARE; + goto bail; + } + } + bail: + return rc; +} + +static uint32_t xscom_decode_chiplet(uint32_t partid, uint64_t *pcb_addr) +{ + uint32_t gcid = (partid & 0x0fffffff) >> 4; + uint32_t core = partid & 0xf; + + *pcb_addr |= P8_EX_PCB_SLAVE_BASE; + *pcb_addr |= core << 24; + + return gcid; +} + +/* + * External API + */ +int xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val) +{ + bool need_unlock; + uint32_t gcid; + int rc; + + /* Handle part ID decoding */ + switch(partid >> 28) { + case 0: /* Normal processor chip */ + gcid = partid; + break; + case 8: /* Centaur */ + return centaur_xscom_read(partid, pcb_addr, val); + case 4: /* EX chiplet */ + gcid = xscom_decode_chiplet(partid, &pcb_addr); + break; + default: + return OPAL_PARAMETER; + } + + /* + * HW822317 requires locking. We use a recursive lock as error + * conditions might cause printf's which might then try to take + * the lock again + */ + need_unlock = lock_recursive(&xscom_lock); + + /* Direct vs indirect access */ + if (pcb_addr & XSCOM_ADDR_IND_FLAG) + rc = xscom_indirect_read(gcid, pcb_addr, val); + else + rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val); + + /* Unlock it */ + if (need_unlock) + unlock(&xscom_lock); + return rc; +} + +opal_call(OPAL_XSCOM_READ, xscom_read, 3); + +int xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val) +{ + bool need_unlock; + uint32_t gcid; + int rc; + + /* Handle part ID decoding */ + switch(partid >> 28) { + case 0: /* Normal processor chip */ + gcid = partid; + break; + case 8: /* Centaur */ + return centaur_xscom_write(partid, pcb_addr, val); + case 4: /* EX chiplet */ + gcid = xscom_decode_chiplet(partid, &pcb_addr); + break; + default: + return OPAL_PARAMETER; + } + + /* + * HW822317 requires locking. 
We use a recursive lock as error + * conditions might cause printf's which might then try to take + * the lock again + */ + need_unlock = lock_recursive(&xscom_lock); + + /* Direct vs indirect access */ + if (pcb_addr & XSCOM_ADDR_IND_FLAG) + rc = xscom_indirect_write(gcid, pcb_addr, val); + else + rc = __xscom_write(gcid, pcb_addr & 0x7fffffff, val); + + /* Unlock it */ + if (need_unlock) + unlock(&xscom_lock); + return rc; +} +opal_call(OPAL_XSCOM_WRITE, xscom_write, 3); + +int xscom_readme(uint64_t pcb_addr, uint64_t *val) +{ + return xscom_read(this_cpu()->chip_id, pcb_addr, val); +} + +int xscom_writeme(uint64_t pcb_addr, uint64_t val) +{ + return xscom_write(this_cpu()->chip_id, pcb_addr, val); +} + +static void xscom_init_chip_info(struct proc_chip *chip) +{ + uint64_t val; + int64_t rc; + + rc = xscom_read(chip->id, 0xf000f, &val); + if (rc) { + prerror("XSCOM: Error %lld reading 0xf000f register\n", rc); + /* We leave chip type to UNKNOWN */ + return; + } + + /* Extract CFAM id */ + val >>= 44; + + /* Identify chip */ + switch(val & 0xff) { + case 0xf9: + chip->type = PROC_CHIP_P7; + assert(proc_gen == proc_gen_p7); + break; + case 0xe8: + chip->type = PROC_CHIP_P7P; + assert(proc_gen == proc_gen_p7); + break; + case 0xef: + chip->type = PROC_CHIP_P8_MURANO; + assert(proc_gen == proc_gen_p8); + break; + case 0xea: + chip->type = PROC_CHIP_P8_VENICE; + assert(proc_gen == proc_gen_p8); + break; + default: + printf("CHIP: Unknown chip type 0x%02x !!!\n", + (unsigned char)(val & 0xff)); + } + + /* Get EC level from CFAM ID */ + chip->ec_level = ((val >> 16) & 0xf) << 4; + chip->ec_level |= (val >> 8) & 0xf; +} + +void xscom_init(void) +{ + struct dt_node *xn; + + dt_for_each_compatible(dt_root, xn, "ibm,xscom") { + uint32_t gcid = dt_get_chip_id(xn); + const struct dt_property *reg; + struct proc_chip *chip; + const char *chip_name; + static const char *chip_names[] = { + "UNKNOWN", "P7", "P7+", "P8E", "P8", + }; + + chip = get_chip(gcid); + assert(chip); + + /* XXX We need a proper address parsing. For now, we just + * "know" that we are looking at a u64 + */ + reg = dt_find_property(xn, "reg"); + assert(reg); + + chip->xscom_base = dt_translate_address(xn, 0, NULL); + + /* Grab processor type and EC level */ + xscom_init_chip_info(chip); + + chip_name = chip->type > PROC_CHIP_P8_VENICE ? "INVALID" : + chip_names[chip->type]; + printf("XSCOM: chip 0x%x at 0x%llx [%s DD%x.%x]\n", + gcid, chip->xscom_base, + chip_name, + chip->ec_level >> 4, + chip->ec_level & 0xf); + } +} + +void xscom_used_by_console(void) +{ + xscom_lock.in_con_path = true; +} -- cgit v1.2.1
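
For reference, here is a minimal sketch of how other firmware code might consume the XSCOM API defined in hw/xscom.c above. It is illustrative only: the SCOM offsets below are hypothetical placeholders, and only the functions actually exported by this patch (xscom_read, xscom_write, xscom_readme) are assumed.

/* Illustrative usage sketch; the register offsets are placeholders. */
#include <skiboot.h>
#include <xscom.h>

static void xscom_usage_example(uint32_t chip_id)
{
	uint64_t val;
	int rc;

	/* Direct access: plain PCB address, no indirect flag */
	rc = xscom_read(chip_id, 0x2013028, &val);
	if (rc)
		prerror("EXAMPLE: direct read failed (%d)\n", rc);

	/* Indirect access: the top address bit (XSCOM_ADDR_IND_FLAG)
	 * routes the request through the indirect path, which polls
	 * for completion up to XSCOM_IND_MAX_RETRIES times.
	 */
	rc = xscom_write(chip_id, 0x8000000000010013ull, 0xdeadbeefull);
	if (rc)
		prerror("EXAMPLE: indirect write failed (%d)\n", rc);

	/* Shorthand for accessing the calling CPU's own chip */
	rc = xscom_readme(0x2013028, &val);
	(void)rc;
}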