25 files changed, 6013 insertions, 129 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 6f07ec1c2f58..4e4c8a4a5fd3 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -116,6 +116,8 @@ config PARIDE
 
 source "drivers/block/paride/Kconfig"
 
+source "drivers/block/mtip32xx/Kconfig"
+
 config BLK_CPQ_DA
 	tristate "Compaq SMART2 support"
 	depends on PCI && VIRT_TO_BUS
@@ -315,6 +317,17 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
+config BLK_DEV_NVME
+	tristate "NVM Express block device"
+	depends on PCI
+	---help---
+	  The NVM Express driver is for solid state drives directly
+	  connected to the PCI or PCI Express bus.  If you know you
+	  don't have one of these, it is safe to answer N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called nvme.
+
 config BLK_DEV_OSD
 	tristate "OSD object-as-blkdev support"
 	depends on SCSI_OSD_ULD
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 76646e9a1c91..5b795059f8fb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
+obj-$(CONFIG_BLK_DEV_NVME)	+= nvme.o
 obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
@@ -39,5 +40,6 @@ obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= xen-blkback/
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
 obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 587cce57adae..b0f553b26d0f 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1735,7 +1735,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_BIG_PASSTHRU:
 		return cciss_bigpassthru(h, argp);
 
-	/* scsi_cmd_ioctl handles these, below, though some are not */
+	/* scsi_cmd_blk_ioctl handles these, below, though some are not */
 	/* very meaningful for cciss.  SG_IO is the main one people want. */
 
 	case SG_GET_VERSION_NUM:
@@ -1746,9 +1746,9 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case SG_EMULATED_HOST:
 	case SG_IO:
 	case SCSI_IOCTL_SEND_COMMAND:
-		return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
+		return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
 
-	/* scsi_cmd_ioctl would normally handle these, below, but */
+	/* scsi_cmd_blk_ioctl would normally handle these, below, but */
 	/* they aren't a good fit for cciss, as CD-ROMs are */
 	/* not supported, and we don't have any bus/target/lun */
 	/* which we present to the kernel. */
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 9cf20355ceec..8d680562ba73 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -59,8 +59,8 @@
 
 /* module parameter, defined in drbd_main.c */
 extern unsigned int minor_count;
-extern int disable_sendpage;
-extern int allow_oos;
+extern bool disable_sendpage;
+extern bool allow_oos;
 extern unsigned int cn_idx;
 
 #ifdef CONFIG_DRBD_FAULT_INJECTION
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0358e55356c8..211fc44f84be 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -117,8 +117,8 @@ module_param(fault_devs, int, 0644);
 
 /* module parameter, defined */
 unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
-int disable_sendpage;
-int allow_oos;
+bool disable_sendpage;
+bool allow_oos;
 unsigned int cn_idx = CN_IDX_DRBD;
 int proc_details;       /* Detail level in proc drbd*/
 
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
new file mode 100644
index 000000000000..b5dd14e072f2
--- /dev/null
+++ b/drivers/block/mtip32xx/Kconfig
@@ -0,0 +1,9 @@
+#
+# mtip32xx device driver configuration
+#
+
+config BLK_DEV_PCIESSD_MTIP32XX
+	tristate "Block Device Driver for Micron PCIe SSDs"
+	depends on HOTPLUG_PCI_PCIE
+	help
+          This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile
new file mode 100644
index 000000000000..4fbef8c8329b
--- /dev/null
+++ b/drivers/block/mtip32xx/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for  Block device driver for Micron PCIe SSD
+#
+
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
new file mode 100644
index 000000000000..b74eab70c3d0
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -0,0 +1,3651 @@
+/*
+ * Driver for the Micron P320 SSD
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ata.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <../drivers/ata/ahci.h>
+#include "mtip32xx.h"
+
+#define HW_CMD_SLOT_SZ		(MTIP_MAX_COMMAND_SLOTS * 32)
+#define HW_CMD_TBL_SZ		(AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16))
+#define HW_CMD_TBL_AR_SZ	(HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS)
+#define HW_PORT_PRIV_DMA_SZ \
+		(HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
+
+#define HOST_HSORG		0xFC
+#define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
+#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
+#define HSORG_HWREV		0xFF00
+#define HSORG_STYLE		0x8
+#define HSORG_SLOTGROUPS	0x7
+
+#define PORT_COMMAND_ISSUE	0x38
+#define PORT_SDBV		0x7C
+
+#define PORT_OFFSET		0x100
+#define PORT_MEM_SIZE		0x80
+
+#define PORT_IRQ_ERR \
+	(PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \
+	 PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \
+	 PORT_IRQ_OVERFLOW)
+#define PORT_IRQ_LEGACY \
+	(PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS)
+#define PORT_IRQ_HANDLED \
+	(PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \
+	 PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)
+#define DEF_PORT_IRQ \
+	(PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS)
+
+/* product numbers */
+#define MTIP_PRODUCT_UNKNOWN	0x00
+#define MTIP_PRODUCT_ASICFPGA	0x11
+
+/* Device instance number, incremented each time a device is probed. */
+static int instance;
+
+/*
+ * Global variable used to hold the major block device number
+ * allocated in mtip_init().
+ */
+static int mtip_major;
+
+static DEFINE_SPINLOCK(rssd_index_lock);
+static DEFINE_IDA(rssd_index_ida);
+
+static int mtip_block_initialize(struct driver_data *dd);
+
+#ifdef CONFIG_COMPAT
+struct mtip_compat_ide_task_request_s {
+	__u8		io_ports[8];
+	__u8		hob_ports[8];
+	ide_reg_valid_t	out_flags;
+	ide_reg_valid_t	in_flags;
+	int		data_phase;
+	int		req_cmd;
+	compat_ulong_t	out_size;
+	compat_ulong_t	in_size;
+};
+#endif
+
+/*
+ * This function check_for_surprise_removal is called
+ * while card is removed from the system and it will
+ * read the vendor id from the configration space
+ *
+ * @pdev Pointer to the pci_dev structure.
+ *
+ * return value
+ *	 true if device removed, else false
+ */
+static bool mtip_check_surprise_removal(struct pci_dev *pdev)
+{
+	u16 vendor_id = 0;
+
+       /* Read the vendorID from the configuration space */
+	pci_read_config_word(pdev, 0x00, &vendor_id);
+	if (vendor_id == 0xFFFF)
+		return true; /* device removed */
+
+	return false; /* device present */
+}
+
+/*
+ * This function is called for clean the pending command in the
+ * command slot during the surprise removal of device and return
+ * error to the upper layer.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_command_cleanup(struct driver_data *dd)
+{
+	int group = 0, commandslot = 0, commandindex = 0;
+	struct mtip_cmd *command;
+	struct mtip_port *port = dd->port;
+
+	for (group = 0; group < 4; group++) {
+		for (commandslot = 0; commandslot < 32; commandslot++) {
+			if (!(port->allocated[group] & (1 << commandslot)))
+				continue;
+
+			commandindex = group << 5 | commandslot;
+			command = &port->commands[commandindex];
+
+			if (atomic_read(&command->active)
+			    && (command->async_callback)) {
+				command->async_callback(command->async_data,
+					-ENODEV);
+				command->async_callback = NULL;
+				command->async_data = NULL;
+			}
+
+			dma_unmap_sg(&port->dd->pdev->dev,
+				command->sg,
+				command->scatter_ents,
+				command->direction);
+		}
+	}
+
+	up(&port->cmd_slot);
+
+	atomic_set(&dd->drv_cleanup_done, true);
+}
+
+/*
+ * Obtain an empty command slot.
+ *
+ * This function needs to be reentrant since it could be called
+ * at the same time on multiple CPUs. The allocation of the
+ * command slot must be atomic.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	>= 0	Index of command slot obtained.
+ *	-1	No command slots available.
+ */
+static int get_slot(struct mtip_port *port)
+{
+	int slot, i;
+	unsigned int num_command_slots = port->dd->slot_groups * 32;
+
+	/*
+	 * Try 10 times, because there is a small race here.
+	 *  that's ok, because it's still cheaper than a lock.
+	 *
+	 * Race: Since this section is not protected by lock, same bit
+	 * could be chosen by different process contexts running in
+	 * different processor. So instead of costly lock, we are going
+	 * with loop.
+	 */
+	for (i = 0; i < 10; i++) {
+		slot = find_next_zero_bit(port->allocated,
+					 num_command_slots, 1);
+		if ((slot < num_command_slots) &&
+		    (!test_and_set_bit(slot, port->allocated)))
+			return slot;
+	}
+	dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
+
+	if (mtip_check_surprise_removal(port->dd->pdev)) {
+		/* Device not present, clean outstanding commands */
+		mtip_command_cleanup(port->dd);
+	}
+	return -1;
+}
+
+/*
+ * Release a command slot.
+ *
+ * @port Pointer to the port data structure.
+ * @tag  Tag of command to release
+ *
+ * return value
+ *	None
+ */
+static inline void release_slot(struct mtip_port *port, int tag)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(tag, port->allocated);
+	smp_mb__after_clear_bit();
+}
+
+/*
+ * Reset the HBA (without sleeping)
+ *
+ * Just like hba_reset, except does not call sleep, so can be
+ * run from interrupt/tasklet context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	The reset was successful.
+ *	-1	The HBA Reset bit did not clear.
+ */
+static int hba_reset_nosleep(struct driver_data *dd)
+{
+	unsigned long timeout;
+
+	/* Chip quirk: quiesce any chip function */
+	mdelay(10);
+
+	/* Set the reset bit */
+	writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+	/* Flush */
+	readl(dd->mmio + HOST_CTL);
+
+	/*
+	 * Wait 10ms then spin for up to 1 second
+	 * waiting for reset acknowledgement
+	 */
+	timeout = jiffies + msecs_to_jiffies(1000);
+	mdelay(10);
+	while ((readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		 && time_before(jiffies, timeout))
+		mdelay(1);
+
+	if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Issue a command to the hardware.
+ *
+ * Set the appropriate bit in the s_active and Command Issue hardware
+ * registers, causing hardware command processing to begin.
+ *
+ * @port Pointer to the port structure.
+ * @tag  The tag of the command to be issued.
+ *
+ * return value
+ *      None
+ */
+static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
+{
+	unsigned long flags = 0;
+
+	atomic_set(&port->commands[tag].active, 1);
+
+	spin_lock_irqsave(&port->cmd_issue_lock, flags);
+
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->s_active[MTIP_TAG_INDEX(tag)]);
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+
+	spin_unlock_irqrestore(&port->cmd_issue_lock, flags);
+}
+
+/*
+ * Enable/disable the reception of FIS
+ *
+ * @port   Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ *	Previous state: 1 enabled, 0 disabled
+ */
+static int mtip_enable_fis(struct mtip_port *port, int enable)
+{
+	u32 tmp;
+
+	/* enable FIS reception */
+	tmp = readl(port->mmio + PORT_CMD);
+	if (enable)
+		writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+	else
+		writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+
+	/* Flush */
+	readl(port->mmio + PORT_CMD);
+
+	return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX));
+}
+
+/*
+ * Enable/disable the DMA engine
+ *
+ * @port   Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ *	Previous state: 1 enabled, 0 disabled.
+ */
+static int mtip_enable_engine(struct mtip_port *port, int enable)
+{
+	u32 tmp;
+
+	/* enable FIS reception */
+	tmp = readl(port->mmio + PORT_CMD);
+	if (enable)
+		writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD);
+	else
+		writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD);
+
+	readl(port->mmio + PORT_CMD);
+	return (((tmp & PORT_CMD_START) == PORT_CMD_START));
+}
+
+/*
+ * Enables the port DMA engine and FIS reception.
+ *
+ * return value
+ *	None
+ */
+static inline void mtip_start_port(struct mtip_port *port)
+{
+	/* Enable FIS reception */
+	mtip_enable_fis(port, 1);
+
+	/* Enable the DMA engine */
+	mtip_enable_engine(port, 1);
+}
+
+/*
+ * Deinitialize a port by disabling port interrupts, the DMA engine,
+ * and FIS reception.
+ *
+ * @port Pointer to the port structure
+ *
+ * return value
+ *	None
+ */
+static inline void mtip_deinit_port(struct mtip_port *port)
+{
+	/* Disable interrupts on this port */
+	writel(0, port->mmio + PORT_IRQ_MASK);
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Disable FIS reception */
+	mtip_enable_fis(port, 0);
+}
+
+/*
+ * Initialize a port.
+ *
+ * This function deinitializes the port by calling mtip_deinit_port() and
+ * then initializes it by setting the command header and RX FIS addresses,
+ * clearing the SError register and any pending port interrupts before
+ * re-enabling the default set of port interrupts.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_init_port(struct mtip_port *port)
+{
+	int i;
+	mtip_deinit_port(port);
+
+	/* Program the command list base and FIS base addresses */
+	if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) {
+		writel((port->command_list_dma >> 16) >> 16,
+			 port->mmio + PORT_LST_ADDR_HI);
+		writel((port->rxfis_dma >> 16) >> 16,
+			 port->mmio + PORT_FIS_ADDR_HI);
+	}
+
+	writel(port->command_list_dma & 0xFFFFFFFF,
+			port->mmio + PORT_LST_ADDR);
+	writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR);
+
+	/* Clear SError */
+	writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+	/* reset the completed registers.*/
+	for (i = 0; i < port->dd->slot_groups; i++)
+		writel(0xFFFFFFFF, port->completed[i]);
+
+	/* Clear any pending interrupts for this port */
+	writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT);
+
+	/* Enable port interrupts */
+	writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
+}
+
+/*
+ * Restart a port
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_restart_port(struct mtip_port *port)
+{
+	unsigned long timeout;
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON)
+		 && time_before(jiffies, timeout))
+		;
+
+	/*
+	 * Chip quirk: escalate to hba reset if
+	 * PxCMD.CR not clear after 500 ms
+	 */
+	if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) {
+		dev_warn(&port->dd->pdev->dev,
+			"PxCMD.CR not clear, escalating reset\n");
+
+		if (hba_reset_nosleep(port->dd))
+			dev_err(&port->dd->pdev->dev,
+				"HBA reset escalation failed.\n");
+
+		/* 30 ms delay before com reset to quiesce chip */
+		mdelay(30);
+	}
+
+	dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n");
+
+	/* Set PxSCTL.DET */
+	writel(readl(port->mmio + PORT_SCR_CTL) |
+			 1, port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Wait 1 ms to quiesce chip function */
+	timeout = jiffies + msecs_to_jiffies(1);
+	while (time_before(jiffies, timeout))
+		;
+
+	/* Clear PxSCTL.DET */
+	writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
+			 port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+			 && time_before(jiffies, timeout))
+		;
+
+	if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+		dev_warn(&port->dd->pdev->dev,
+			"COM reset failed\n");
+
+	/* Clear SError, the PxSERR.DIAG.x should be set so clear it */
+	writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+	/* Enable the DMA engine */
+	mtip_enable_engine(port, 1);
+}
+
+/*
+ * Called periodically to see if any read/write commands are
+ * taking too long to complete.
+ *
+ * @data Pointer to the PORT data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_timeout_function(unsigned long int data)
+{
+	struct mtip_port *port = (struct mtip_port *) data;
+	struct host_to_dev_fis *fis;
+	struct mtip_cmd *command;
+	int tag, cmdto_cnt = 0;
+	unsigned int bit, group;
+	unsigned int num_command_slots = port->dd->slot_groups * 32;
+
+	if (unlikely(!port))
+		return;
+
+	if (atomic_read(&port->dd->resumeflag) == true) {
+		mod_timer(&port->cmd_timer,
+			jiffies + msecs_to_jiffies(30000));
+		return;
+	}
+
+	for (tag = 0; tag < num_command_slots; tag++) {
+		/*
+		 * Skip internal command slot as it has
+		 * its own timeout mechanism
+		 */
+		if (tag == MTIP_TAG_INTERNAL)
+			continue;
+
+		if (atomic_read(&port->commands[tag].active) &&
+		   (time_after(jiffies, port->commands[tag].comp_time))) {
+			group = tag >> 5;
+			bit = tag & 0x1F;
+
+			command = &port->commands[tag];
+			fis = (struct host_to_dev_fis *) command->command;
+
+			dev_warn(&port->dd->pdev->dev,
+				"Timeout for command tag %d\n", tag);
+
+			cmdto_cnt++;
+			if (cmdto_cnt == 1)
+				set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+
+			/*
+			 * Clear the completed bit. This should prevent
+			 *  any interrupt handlers from trying to retire
+			 *  the command.
+			 */
+			writel(1 << bit, port->completed[group]);
+
+			/* Call the async completion callback. */
+			if (likely(command->async_callback))
+				command->async_callback(command->async_data,
+							 -EIO);
+			command->async_callback = NULL;
+			command->comp_func = NULL;
+
+			/* Unmap the DMA scatter list entries */
+			dma_unmap_sg(&port->dd->pdev->dev,
+					command->sg,
+					command->scatter_ents,
+					command->direction);
+
+			/*
+			 * Clear the allocated bit and active tag for the
+			 * command.
+			 */
+			atomic_set(&port->commands[tag].active, 0);
+			release_slot(port, tag);
+
+			up(&port->cmd_slot);
+		}
+	}
+
+	if (cmdto_cnt) {
+		dev_warn(&port->dd->pdev->dev,
+			"%d commands timed out: restarting port",
+			cmdto_cnt);
+		mtip_restart_port(port);
+		clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+		wake_up_interruptible(&port->svc_wait);
+	}
+
+	/* Restart the timer */
+	mod_timer(&port->cmd_timer,
+		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It first calls the
+ * asynchronous completion function which normally calls back into the block
+ * layer passing the asynchronous callback data, then unmaps the
+ * scatter list associated with the completed command, and finally
+ * clears the allocated bit associated with the completed command.
+ *
+ * @port   Pointer to the port data structure.
+ * @tag    Tag of the command.
+ * @data   Pointer to driver_data.
+ * @status Completion status.
+ *
+ * return value
+ *	None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+				int tag,
+				void *data,
+				int status)
+{
+	struct mtip_cmd *command;
+	struct driver_data *dd = data;
+	int cb_status = status ? -EIO : 0;
+
+	if (unlikely(!dd) || unlikely(!port))
+		return;
+
+	command = &port->commands[tag];
+
+	if (unlikely(status == PORT_IRQ_TF_ERR)) {
+		dev_warn(&port->dd->pdev->dev,
+			"Command tag %d failed due to TFE\n", tag);
+	}
+
+	/* Upper layer callback */
+	if (likely(command->async_callback))
+		command->async_callback(command->async_data, cb_status);
+
+	command->async_callback = NULL;
+	command->comp_func = NULL;
+
+	/* Unmap the DMA scatter list entries */
+	dma_unmap_sg(&dd->pdev->dev,
+		command->sg,
+		command->scatter_ents,
+		command->direction);
+
+	/* Clear the allocated and active bits for the command */
+	atomic_set(&port->commands[tag].active, 0);
+	release_slot(port, tag);
+
+	up(&port->cmd_slot);
+}
+
+/*
+ * Internal command completion callback function.
+ *
+ * This function is normally called by the driver ISR when an internal
+ * command completed. This function signals the command completion by
+ * calling complete().
+ *
+ * @port   Pointer to the port data structure.
+ * @tag    Tag of the command that has completed.
+ * @data   Pointer to a completion structure.
+ * @status Completion status.
+ *
+ * return value
+ *	None
+ */
+static void mtip_completion(struct mtip_port *port,
+			    int tag,
+			    void *data,
+			    int status)
+{
+	struct mtip_cmd *command = &port->commands[tag];
+	struct completion *waiting = data;
+	if (unlikely(status == PORT_IRQ_TF_ERR))
+		dev_warn(&port->dd->pdev->dev,
+			"Internal command %d completed with TFE\n", tag);
+
+	command->async_callback = NULL;
+	command->comp_func = NULL;
+
+	complete(waiting);
+}
+
+/*
+ * Helper function for tag logging
+ */
+static void print_tags(struct driver_data *dd,
+			char *msg,
+			unsigned long *tagbits)
+{
+	unsigned int tag, count = 0;
+
+	for (tag = 0; tag < (dd->slot_groups) * 32; tag++) {
+		if (test_bit(tag, tagbits))
+			count++;
+	}
+	if (count)
+		dev_info(&dd->pdev->dev, "%s [%i tags]\n", msg, count);
+}
+
+/*
+ * Handle an error.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_handle_tfe(struct driver_data *dd)
+{
+	int group, tag, bit, reissue;
+	struct mtip_port *port;
+	struct mtip_cmd  *command;
+	u32 completed;
+	struct host_to_dev_fis *fis;
+	unsigned long tagaccum[SLOTBITS_IN_LONGS];
+
+	dev_warn(&dd->pdev->dev, "Taskfile error\n");
+
+	port = dd->port;
+
+	/* Stop the timer to prevent command timeouts. */
+	del_timer(&port->cmd_timer);
+
+	/* Set eh_active */
+	set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		completed = readl(port->completed[group]);
+
+		/* clear completed status register in the hardware.*/
+		writel(completed, port->completed[group]);
+
+		/* clear the tag accumulator */
+		memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+		/* Process successfully completed commands */
+		for (bit = 0; bit < 32 && completed; bit++) {
+			if (!(completed & (1<<bit)))
+				continue;
+			tag = (group << 5) + bit;
+
+			/* Skip the internal command slot */
+			if (tag == MTIP_TAG_INTERNAL)
+				continue;
+
+			command = &port->commands[tag];
+			if (likely(command->comp_func)) {
+				set_bit(tag, tagaccum);
+				atomic_set(&port->commands[tag].active, 0);
+				command->comp_func(port,
+					 tag,
+					 command->comp_data,
+					 0);
+			} else {
+				dev_err(&port->dd->pdev->dev,
+					"Missing completion func for tag %d",
+					tag);
+				if (mtip_check_surprise_removal(dd->pdev)) {
+					mtip_command_cleanup(dd);
+					/* don't proceed further */
+					return;
+				}
+			}
+		}
+	}
+	print_tags(dd, "TFE tags completed:", tagaccum);
+
+	/* Restart the port */
+	mdelay(20);
+	mtip_restart_port(port);
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		for (bit = 0; bit < 32; bit++) {
+			reissue = 1;
+			tag = (group << 5) + bit;
+
+			/* If the active bit is set re-issue the command */
+			if (atomic_read(&port->commands[tag].active) == 0)
+				continue;
+
+			fis = (struct host_to_dev_fis *)
+				port->commands[tag].command;
+
+			/* Should re-issue? */
+			if (tag == MTIP_TAG_INTERNAL ||
+			    fis->command == ATA_CMD_SET_FEATURES)
+				reissue = 0;
+
+			/*
+			 * First check if this command has
+			 *  exceeded its retries.
+			 */
+			if (reissue &&
+			    (port->commands[tag].retries-- > 0)) {
+
+				set_bit(tag, tagaccum);
+
+				/* Update the timeout value. */
+				port->commands[tag].comp_time =
+					jiffies + msecs_to_jiffies(
+					MTIP_NCQ_COMMAND_TIMEOUT_MS);
+				/* Re-issue the command. */
+				mtip_issue_ncq_command(port, tag);
+
+				continue;
+			}
+
+			/* Retire a command that will not be reissued */
+			dev_warn(&port->dd->pdev->dev,
+				"retiring tag %d\n", tag);
+			atomic_set(&port->commands[tag].active, 0);
+
+			if (port->commands[tag].comp_func)
+				port->commands[tag].comp_func(
+					port,
+					tag,
+					port->commands[tag].comp_data,
+					PORT_IRQ_TF_ERR);
+			else
+				dev_warn(&port->dd->pdev->dev,
+					"Bad completion for tag %d\n",
+					tag);
+		}
+	}
+	print_tags(dd, "TFE tags reissued:", tagaccum);
+
+	/* clear eh_active */
+	clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+	wake_up_interruptible(&port->svc_wait);
+
+	mod_timer(&port->cmd_timer,
+		 jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * Handle a set device bits interrupt
+ */
+static inline void mtip_process_sdbf(struct driver_data *dd)
+{
+	struct mtip_port  *port = dd->port;
+	int group, tag, bit;
+	u32 completed;
+	struct mtip_cmd *command;
+
+	/* walk all bits in all slot groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		completed = readl(port->completed[group]);
+
+		/* clear completed status register in the hardware.*/
+		writel(completed, port->completed[group]);
+
+		/* Process completed commands. */
+		for (bit = 0;
+		     (bit < 32) && completed;
+		     bit++, completed >>= 1) {
+			if (completed & 0x01) {
+				tag = (group << 5) | bit;
+
+				/* skip internal command slot. */
+				if (unlikely(tag == MTIP_TAG_INTERNAL))
+					continue;
+
+				command = &port->commands[tag];
+				/* make internal callback */
+				if (likely(command->comp_func)) {
+					command->comp_func(
+						port,
+						tag,
+						command->comp_data,
+						0);
+				} else {
+					dev_warn(&dd->pdev->dev,
+						"Null completion "
+						"for tag %d",
+						tag);
+
+					if (mtip_check_surprise_removal(
+						dd->pdev)) {
+						mtip_command_cleanup(dd);
+						return;
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Process legacy pio and d2h interrupts
+ */
+static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
+{
+	struct mtip_port *port = dd->port;
+	struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL];
+
+	if (test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) &&
+	    (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+		& (1 << MTIP_TAG_INTERNAL))) {
+		if (cmd->comp_func) {
+			cmd->comp_func(port,
+				MTIP_TAG_INTERNAL,
+				cmd->comp_data,
+				0);
+			return;
+		}
+	}
+
+	dev_warn(&dd->pdev->dev, "IRQ status 0x%x ignored.\n", port_stat);
+
+	return;
+}
+
+/*
+ * Demux and handle errors
+ */
+static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
+{
+	if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR)))
+		mtip_handle_tfe(dd);
+
+	if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.x\n");
+		writel((1 << 26), dd->port->mmio + PORT_SCR_ERR);
+	}
+
+	if (unlikely(port_stat & PORT_IRQ_PHYRDY)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.n\n");
+		writel((1 << 16), dd->port->mmio + PORT_SCR_ERR);
+	}
+
+	if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) {
+		dev_warn(&dd->pdev->dev,
+			"Port stat errors %x unhandled\n",
+			(port_stat & ~PORT_IRQ_HANDLED));
+	}
+}
+
+static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
+{
+	struct driver_data *dd = (struct driver_data *) data;
+	struct mtip_port *port = dd->port;
+	u32 hba_stat, port_stat;
+	int rv = IRQ_NONE;
+
+	hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+	if (hba_stat) {
+		rv = IRQ_HANDLED;
+
+		/* Acknowledge the interrupt status on the port.*/
+		port_stat = readl(port->mmio + PORT_IRQ_STAT);
+		writel(port_stat, port->mmio + PORT_IRQ_STAT);
+
+		/* Demux port status */
+		if (likely(port_stat & PORT_IRQ_SDB_FIS))
+			mtip_process_sdbf(dd);
+
+		if (unlikely(port_stat & PORT_IRQ_ERR)) {
+			if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+				mtip_command_cleanup(dd);
+				/* don't proceed further */
+				return IRQ_HANDLED;
+			}
+
+			mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
+		}
+
+		if (unlikely(port_stat & PORT_IRQ_LEGACY))
+			mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY);
+	}
+
+	/* acknowledge interrupt */
+	writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+
+	return rv;
+}
+
+/*
+ * Wrapper for mtip_handle_irq
+ * (ignores return code)
+ */
+static void mtip_tasklet(unsigned long data)
+{
+	mtip_handle_irq((struct driver_data *) data);
+}
+
+/*
+ * HBA interrupt subroutine.
+ *
+ * @irq		IRQ number.
+ * @instance	Pointer to the driver data structure.
+ *
+ * return value
+ *	IRQ_HANDLED	A HBA interrupt was pending and handled.
+ *	IRQ_NONE	This interrupt was not for the HBA.
+ */
+static irqreturn_t mtip_irq_handler(int irq, void *instance)
+{
+	struct driver_data *dd = instance;
+	tasklet_schedule(&dd->tasklet);
+	return IRQ_HANDLED;
+}
+
+static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
+{
+	atomic_set(&port->commands[tag].active, 1);
+	writel(1 << MTIP_TAG_BIT(tag),
+		port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+}
+
+/*
+ * Wait for port to quiesce
+ *
+ * @port    Pointer to port data structure
+ * @timeout Max duration to wait (ms)
+ *
+ * return value
+ *	0	Success
+ *	-EBUSY  Commands still active
+ */
+static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
+{
+	unsigned long to;
+	unsigned int n;
+	unsigned int active = 1;
+
+	to = jiffies + msecs_to_jiffies(timeout);
+	do {
+		if (test_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) {
+			msleep(20);
+			continue; /* svc thd is actively issuing commands */
+		}
+		/*
+		 * Ignore s_active bit 0 of array element 0.
+		 * This bit will always be set
+		 */
+		active = readl(port->s_active[0]) & 0xFFFFFFFE;
+		for (n = 1; n < port->dd->slot_groups; n++)
+			active |= readl(port->s_active[n]);
+
+		if (!active)
+			break;
+
+		msleep(20);
+	} while (time_before(jiffies, to));
+
+	return active ? -EBUSY : 0;
+}
+
+/*
+ * Execute an internal command and wait for the completion.
+ *
+ * @port    Pointer to the port data structure.
+ * @fis     Pointer to the FIS that describes the command.
+ * @fis_len  Length in WORDS of the FIS.
+ * @buffer  DMA accessible for command data.
+ * @buf_len  Length, in bytes, of the data buffer.
+ * @opts    Command header options, excluding the FIS length
+ *             and the number of PRD entries.
+ * @timeout Time in ms to wait for the command to complete.
+ *
+ * return value
+ *	0	 Command completed successfully.
+ *	-EFAULT  The buffer address is not correctly aligned.
+ *	-EBUSY   Internal command or other IO in progress.
+ *	-EAGAIN  Time out waiting for command to complete.
+ */
+static int mtip_exec_internal_command(struct mtip_port *port,
+					void *fis,
+					int fis_len,
+					dma_addr_t buffer,
+					int buf_len,
+					u32 opts,
+					gfp_t atomic,
+					unsigned long timeout)
+{
+	struct mtip_cmd_sg *command_sg;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	int rv = 0;
+	struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL];
+
+	/* Make sure the buffer is 8 byte aligned. This is asic specific. */
+	if (buffer & 0x00000007) {
+		dev_err(&port->dd->pdev->dev,
+			"SG buffer is not 8 byte aligned\n");
+		return -EFAULT;
+	}
+
+	/* Only one internal command should be running at a time */
+	if (test_and_set_bit(MTIP_TAG_INTERNAL, port->allocated)) {
+		dev_warn(&port->dd->pdev->dev,
+			"Internal command already active\n");
+		return -EBUSY;
+	}
+	set_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags);
+
+	if (atomic == GFP_KERNEL) {
+		/* wait for io to complete if non atomic */
+		if (mtip_quiesce_io(port, 5000) < 0) {
+			dev_warn(&port->dd->pdev->dev,
+				"Failed to quiesce IO\n");
+			release_slot(port, MTIP_TAG_INTERNAL);
+			clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags);
+			wake_up_interruptible(&port->svc_wait);
+			return -EBUSY;
+		}
+
+		/* Set the completion function and data for the command. */
+		int_cmd->comp_data = &wait;
+		int_cmd->comp_func = mtip_completion;
+
+	} else {
+		/* Clear completion - we're going to poll */
+		int_cmd->comp_data = NULL;
+		int_cmd->comp_func = NULL;
+	}
+
+	/* Copy the command to the command table */
+	memcpy(int_cmd->command, fis, fis_len*4);
+
+	/* Populate the SG list */
+	int_cmd->command_header->opts =
+		 __force_bit2int cpu_to_le32(opts | fis_len);
+	if (buf_len) {
+		command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ;
+
+		command_sg->info =
+			__force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF);
+		command_sg->dba	=
+			__force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF);
+		command_sg->dba_upper =
+			__force_bit2int cpu_to_le32((buffer >> 16) >> 16);
+
+		int_cmd->command_header->opts |=
+			__force_bit2int cpu_to_le32((1 << 16));
+	}
+
+	/* Populate the command header */
+	int_cmd->command_header->byte_count = 0;
+
+	/* Issue the command to the hardware */
+	mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL);
+
+	/* Poll if atomic, wait_for_completion otherwise */
+	if (atomic == GFP_KERNEL) {
+		/* Wait for the command to complete or timeout. */
+		if (wait_for_completion_timeout(
+				&wait,
+				msecs_to_jiffies(timeout)) == 0) {
+			dev_err(&port->dd->pdev->dev,
+				"Internal command did not complete [%d] "
+				"within timeout of  %lu ms\n",
+				atomic, timeout);
+			rv = -EAGAIN;
+		}
+
+		if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+			& (1 << MTIP_TAG_INTERNAL)) {
+			dev_warn(&port->dd->pdev->dev,
+				"Retiring internal command but CI is 1.\n");
+		}
+
+	} else {
+		/* Spin for <timeout> checking if command still outstanding */
+		timeout = jiffies + msecs_to_jiffies(timeout);
+
+		while ((readl(
+			port->cmd_issue[MTIP_TAG_INTERNAL])
+			& (1 << MTIP_TAG_INTERNAL))
+			&& time_before(jiffies, timeout))
+			;
+
+		if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+			& (1 << MTIP_TAG_INTERNAL)) {
+			dev_err(&port->dd->pdev->dev,
+				"Internal command did not complete [%d]\n",
+				atomic);
+			rv = -EAGAIN;
+		}
+	}
+
+	/* Clear the allocated and active bits for the internal command. */
+	atomic_set(&int_cmd->active, 0);
+	release_slot(port, MTIP_TAG_INTERNAL);
+	clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags);
+	wake_up_interruptible(&port->svc_wait);
+
+	return rv;
+}
+
+/*
+ * Byte-swap ATA ID strings.
+ *
+ * ATA identify data contains strings in byte-swapped 16-bit words.
+ * They must be swapped (on all architectures) to be usable as C strings.
+ * This function swaps bytes in-place.
+ *
+ * @buf The buffer location of the string
+ * @len The number of bytes to swap
+ *
+ * return value
+ *	None
+ */
+static inline void ata_swap_string(u16 *buf, unsigned int len)
+{
+	int i;
+	for (i = 0; i < (len/2); i++)
+		be16_to_cpus(&buf[i]);
+}
+
+/*
+ * Request the device identity information.
+ *
+ * If a user space buffer is not specified, i.e. is NULL, the
+ * identify information is still read from the drive and placed
+ * into the identify data buffer (@e port->identify) in the
+ * port data structure.
+ * When the identify buffer contains valid identify information @e
+ * port->identify_valid is non-zero.
+ *
+ * @port	 Pointer to the port structure.
+ * @user_buffer  A user space buffer where the identify data should be
+ *                    copied.
+ *
+ * return value
+ *	0	Command completed successfully.
+ *	-EFAULT An error occurred while coping data to the user buffer.
+ *	-1	Command failed.
+ */
+static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
+{
+	int rv = 0;
+	struct host_to_dev_fis fis;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_ID_ATA;
+
+	/* Set the identify information as invalid. */
+	port->identify_valid = 0;
+
+	/* Clear the identify information. */
+	memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				5,
+				port->identify_dma,
+				sizeof(u16) * ATA_ID_WORDS,
+				0,
+				GFP_KERNEL,
+				MTIP_INTERNAL_COMMAND_TIMEOUT_MS)
+				< 0) {
+		rv = -1;
+		goto out;
+	}
+
+	/*
+	 * Perform any necessary byte-swapping.  Yes, the kernel does in fact
+	 * perform field-sensitive swapping on the string fields.
+	 * See the kernel use of ata_id_string() for proof of this.
+	 */
+#ifdef __LITTLE_ENDIAN
+	ata_swap_string(port->identify + 27, 40);  /* model string*/
+	ata_swap_string(port->identify + 23, 8);   /* firmware string*/
+	ata_swap_string(port->identify + 10, 20);  /* serial# string*/
+#else
+	{
+		int i;
+		for (i = 0; i < ATA_ID_WORDS; i++)
+			port->identify[i] = le16_to_cpu(port->identify[i]);
+	}
+#endif
+
+	/* Set the identify buffer as valid. */
+	port->identify_valid = 1;
+
+	if (user_buffer) {
+		if (copy_to_user(
+			user_buffer,
+			port->identify,
+			ATA_ID_WORDS * sizeof(u16))) {
+			rv = -EFAULT;
+			goto out;
+		}
+	}
+
+out:
+	return rv;
+}
+
+/*
+ * Issue a standby immediate command to the device.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	0	Command was executed successfully.
+ *	-1	An error occurred while executing the command.
+ */
+static int mtip_standby_immediate(struct mtip_port *port)
+{
+	int rv;
+	struct host_to_dev_fis	fis;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_STANDBYNOW1;
+
+	/* Execute the command.  Use a 15-second timeout for large drives. */
+	rv = mtip_exec_internal_command(port,
+					&fis,
+					5,
+					0,
+					0,
+					0,
+					GFP_KERNEL,
+					15000);
+
+	return rv;
+}
+
+/*
+ * Get the drive capacity.
+ *
+ * @dd      Pointer to the device data structure.
+ * @sectors Pointer to the variable that will receive the sector count.
+ *
+ * return value
+ *	1 Capacity was returned successfully.
+ *	0 The identify information is invalid.
+ */
+static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors)
+{
+	struct mtip_port *port = dd->port;
+	u64 total, raw0, raw1, raw2, raw3;
+	raw0 = port->identify[100];
+	raw1 = port->identify[101];
+	raw2 = port->identify[102];
+	raw3 = port->identify[103];
+	total = raw0 | raw1<<16 | raw2<<32 | raw3<<48;
+	*sectors = total;
+	return (bool) !!port->identify_valid;
+}
+
+/*
+ * Reset the HBA.
+ *
+ * Resets the HBA by setting the HBA Reset bit in the Global
+ * HBA Control register. After setting the HBA Reset bit the
+ * function waits for 1 second before reading the HBA Reset
+ * bit to make sure it has cleared. If HBA Reset is not clear
+ * an error is returned. Cannot be used in non-blockable
+ * context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0  The reset was successful.
+ *	-1 The HBA Reset bit did not clear.
+ */
+static int mtip_hba_reset(struct driver_data *dd)
+{
+	mtip_deinit_port(dd->port);
+
+	/* Set the reset bit */
+	writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+	/* Flush */
+	readl(dd->mmio + HOST_CTL);
+
+	/* Wait for reset to clear */
+	ssleep(1);
+
+	/* Check the bit has cleared */
+	if (readl(dd->mmio + HOST_CTL) & HOST_RESET) {
+		dev_err(&dd->pdev->dev,
+			"Reset bit did not clear.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Display the identify command data.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+	sector_t sectors;
+	unsigned short revid;
+	char cbuf[42];
+
+	if (!port->identify_valid)
+		return;
+
+	strlcpy(cbuf, (char *)(port->identify+10), 21);
+	dev_info(&port->dd->pdev->dev,
+		"Serial No.: %s\n", cbuf);
+
+	strlcpy(cbuf, (char *)(port->identify+23), 9);
+	dev_info(&port->dd->pdev->dev,
+		"Firmware Ver.: %s\n", cbuf);
+
+	strlcpy(cbuf, (char *)(port->identify+27), 41);
+	dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+	if (mtip_hw_get_capacity(port->dd, &sectors))
+		dev_info(&port->dd->pdev->dev,
+			"Capacity: %llu sectors (%llu MB)\n",
+			 (u64)sectors,
+			 ((u64)sectors) * ATA_SECT_SIZE >> 20);
+
+	pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+	switch (revid & 0xFF) {
+	case 0x1:
+		strlcpy(cbuf, "A0", 3);
+		break;
+	case 0x3:
+		strlcpy(cbuf, "A2", 3);
+		break;
+	default:
+		strlcpy(cbuf, "?", 2);
+		break;
+	}
+	dev_info(&port->dd->pdev->dev,
+		"Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the commands scatter list into the command table.
+ *
+ * @command Pointer to the command.
+ * @nents Number of scatter list entries.
+ *
+ * return value
+ *	None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+				struct mtip_cmd *command,
+				int nents)
+{
+	int n;
+	unsigned int dma_len;
+	struct mtip_cmd_sg *command_sg;
+	struct scatterlist *sg = command->sg;
+
+	command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+	for (n = 0; n < nents; n++) {
+		dma_len = sg_dma_len(sg);
+		if (dma_len > 0x400000)
+			dev_err(&dd->pdev->dev,
+				"DMA segment length truncated\n");
+		command_sg->info = __force_bit2int
+			cpu_to_le32((dma_len-1) & 0x3FFFFF);
+		command_sg->dba	= __force_bit2int
+			cpu_to_le32(sg_dma_address(sg));
+		command_sg->dba_upper = __force_bit2int
+			cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+		command_sg++;
+		sg++;
+	}
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
+	fis.features	= command[1];
+	fis.sect_count	= command[2];
+	fis.sector	= command[3];
+	fis.cyl_low	= command[4];
+	fis.cyl_hi	= command[5];
+	fis.device	= command[6] & ~0x10; /* Clear the dev bit*/
+
+
+	dbg_printk(MTIP_DRV_NAME "%s: User Command: cmd %x, feat %x, "
+		"nsect %x, sect %x, lcyl %x, "
+		"hcyl %x, sel %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3],
+		command[4],
+		command[5],
+		command[6]);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				 &fis,
+				 5,
+				 0,
+				 0,
+				 0,
+				 GFP_KERNEL,
+				 MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) {
+		return -1;
+	}
+
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[4] = reply->cyl_low;
+	command[5] = reply->cyl_hi;
+
+	dbg_printk(MTIP_DRV_NAME "%s: Completion Status: stat %x, "
+		"err %x , cyl_lo %x cyl_hi %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[4],
+		command[5]);
+
+	return 0;
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * @param port Pointer to the port data structure.
+ * @param command Pointer to the user specified command parameters.
+ * @param user_buffer Pointer to the user space buffer where read sector
+ *                   data should be copied.
+ *
+ * return value 0 The command completed successfully.
+ * return value -EFAULT An error occurred while copying the completion
+ *                 data to the user space buffer.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_command(struct mtip_port *port, u8 *command,
+				void __user *user_buffer)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type		= 0x27;
+	fis.opts		= 1 << 7;
+	fis.command		= command[0];
+	fis.features	= command[2];
+	fis.sect_count	= command[3];
+	if (fis.command == ATA_CMD_SMART) {
+		fis.sector	= command[1];
+		fis.cyl_low	= 0x4F;
+		fis.cyl_hi	= 0xC2;
+	}
+
+	dbg_printk(MTIP_DRV_NAME
+		"%s: User Command: cmd %x, sect %x, "
+		"feat %x, sectcnt %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3]);
+
+	memset(port->sector_buffer, 0x00, ATA_SECT_SIZE);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				 5,
+				 port->sector_buffer_dma,
+				 (command[3] != 0) ? ATA_SECT_SIZE : 0,
+				 0,
+				 GFP_KERNEL,
+				 MTIP_IOCTL_COMMAND_TIMEOUT_MS)
+				 < 0) {
+		return -1;
+	}
+
+	/* Collect the completion status. */
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[2] = command[3];
+
+	dbg_printk(MTIP_DRV_NAME
+		"%s: Completion Status: stat %x, "
+		"err %x, cmd %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2]);
+
+	if (user_buffer && command[3]) {
+		if (copy_to_user(user_buffer,
+				 port->sector_buffer,
+				 ATA_SECT_SIZE * command[3])) {
+			return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *  Indicates whether a command has a single sector payload.
+ *
+ *  @command passed to the device to perform the certain event.
+ *  @features passed to the device to perform the certain event.
+ *
+ *  return value
+ *	1	command is one that always has a single sector payload,
+ *		regardless of the value in the Sector Count field.
+ *      0       otherwise
+ *
+ */
+static unsigned int implicit_sector(unsigned char command,
+				    unsigned char features)
+{
+	unsigned int rv = 0;
+
+	/* list of commands that have an implicit sector count of 1 */
+	switch (command) {
+	case ATA_CMD_SEC_SET_PASS:
+	case ATA_CMD_SEC_UNLOCK:
+	case ATA_CMD_SEC_ERASE_PREP:
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case ATA_CMD_SEC_FREEZE_LOCK:
+	case ATA_CMD_SEC_DISABLE_PASS:
+	case ATA_CMD_PMP_READ:
+	case ATA_CMD_PMP_WRITE:
+		rv = 1;
+		break;
+	case ATA_CMD_SET_MAX:
+		if (features == ATA_SET_MAX_UNLOCK)
+			rv = 1;
+		break;
+	case ATA_CMD_SMART:
+		if ((features == ATA_SMART_READ_VALUES) ||
+				(features == ATA_SMART_READ_THRESHOLDS))
+			rv = 1;
+		break;
+	case ATA_CMD_CONF_OVERLAY:
+		if ((features == ATA_DCO_IDENTIFY) ||
+				(features == ATA_DCO_SET))
+			rv = 1;
+		break;
+	}
+	return rv;
+}
+
+/*
+ * Executes a taskfile
+ * See ide_taskfile_ioctl() for derivation
+ */
+static int exec_drive_taskfile(struct driver_data *dd,
+			       void __user *buf,
+			       ide_task_request_t *req_task,
+			       int outtotal)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply;
+	u8 *outbuf = NULL;
+	u8 *inbuf = NULL;
+	dma_addr_t outbuf_dma = 0;
+	dma_addr_t inbuf_dma = 0;
+	dma_addr_t dma_buffer = 0;
+	int err = 0;
+	unsigned int taskin = 0;
+	unsigned int taskout = 0;
+	u8 nsect = 0;
+	unsigned int timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+	unsigned int force_single_sector;
+	unsigned int transfer_size;
+	unsigned long task_file_data;
+	int intotal = outtotal + req_task->out_size;
+
+	taskout = req_task->out_size;
+	taskin = req_task->in_size;
+	/* 130560 = 512 * 0xFF*/
+	if (taskin > 130560 || taskout > 130560) {
+		err = -EINVAL;
+		goto abort;
+	}
+
+	if (taskout) {
+		outbuf = kzalloc(taskout, GFP_KERNEL);
+		if (outbuf == NULL) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		if (copy_from_user(outbuf, buf + outtotal, taskout)) {
+			err = -EFAULT;
+			goto abort;
+		}
+		outbuf_dma = pci_map_single(dd->pdev,
+					 outbuf,
+					 taskout,
+					 DMA_TO_DEVICE);
+		if (outbuf_dma == 0) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = outbuf_dma;
+	}
+
+	if (taskin) {
+		inbuf = kzalloc(taskin, GFP_KERNEL);
+		if (inbuf == NULL) {
+			err = -ENOMEM;
+			goto abort;
+		}
+
+		if (copy_from_user(inbuf, buf + intotal, taskin)) {
+			err = -EFAULT;
+			goto abort;
+		}
+		inbuf_dma = pci_map_single(dd->pdev,
+					 inbuf,
+					 taskin, DMA_FROM_DEVICE);
+		if (inbuf_dma == 0) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = inbuf_dma;
+	}
+
+	/* only supports PIO and non-data commands from this ioctl. */
+	switch (req_task->data_phase) {
+	case TASKFILE_OUT:
+		nsect = taskout / ATA_SECT_SIZE;
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_IN:
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_NO_DATA:
+		reply = (dd->port->rxfis + RX_FIS_D2H_REG);
+		break;
+	default:
+		err = -EINVAL;
+		goto abort;
+	}
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= req_task->io_ports[7];
+	fis.features	= req_task->io_ports[1];
+	fis.sect_count	= req_task->io_ports[2];
+	fis.lba_low	= req_task->io_ports[3];
+	fis.lba_mid	= req_task->io_ports[4];
+	fis.lba_hi	= req_task->io_ports[5];
+	 /* Clear the dev bit*/
+	fis.device	= req_task->io_ports[6] & ~0x10;
+
+	if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) {
+		req_task->in_flags.all	=
+			IDE_TASKFILE_STD_IN_FLAGS |
+			(IDE_HOB_STD_IN_FLAGS << 8);
+		fis.lba_low_ex		= req_task->hob_ports[3];
+		fis.lba_mid_ex		= req_task->hob_ports[4];
+		fis.lba_hi_ex		= req_task->hob_ports[5];
+		fis.features_ex		= req_task->hob_ports[1];
+		fis.sect_cnt_ex		= req_task->hob_ports[2];
+
+	} else {
+		req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
+	}
+
+	force_single_sector = implicit_sector(fis.command, fis.features);
+
+	if ((taskin || taskout) && (!fis.sect_count)) {
+		if (nsect)
+			fis.sect_count = nsect;
+		else {
+			if (!force_single_sector) {
+				dev_warn(&dd->pdev->dev,
+					"data movement but "
+					"sect_count is 0\n");
+					err = -EINVAL;
+					goto abort;
+			}
+		}
+	}
+
+	dbg_printk(MTIP_DRV_NAME
+		"taskfile: cmd %x, feat %x, nsect %x,"
+		" sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
+		" head/dev %x\n",
+		fis.command,
+		fis.features,
+		fis.sect_count,
+		fis.lba_low,
+		fis.lba_mid,
+		fis.lba_hi,
+		fis.device);
+
+	switch (fis.command) {
+	case ATA_CMD_DOWNLOAD_MICRO:
+		/* Change timeout for Download Microcode to 60 seconds.*/
+		timeout = 60000;
+		break;
+	case ATA_CMD_SEC_ERASE_UNIT:
+		/* Change timeout for Security Erase Unit to 4 minutes.*/
+		timeout = 240000;
+		break;
+	case ATA_CMD_STANDBYNOW1:
+		/* Change timeout for standby immediate to 10 seconds.*/
+		timeout = 10000;
+		break;
+	case 0xF7:
+	case 0xFA:
+		/* Change timeout for vendor unique command to 10 secs */
+		timeout = 10000;
+		break;
+	case ATA_CMD_SMART:
+		/* Change timeout for vendor unique command to 10 secs */
+		timeout = 10000;
+		break;
+	default:
+		timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+		break;
+	}
+
+	/* Determine the correct transfer size.*/
+	if (force_single_sector)
+		transfer_size = ATA_SECT_SIZE;
+	else
+		transfer_size = ATA_SECT_SIZE * fis.sect_count;
+
+	/* Execute the command.*/
+	if (mtip_exec_internal_command(dd->port,
+				 &fis,
+				 5,
+				 dma_buffer,
+				 transfer_size,
+				 0,
+				 GFP_KERNEL,
+				 timeout) < 0) {
+		err = -EIO;
+		goto abort;
+	}
+
+	task_file_data = readl(dd->port->mmio+PORT_TFDATA);
+
+	if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) {
+		reply = dd->port->rxfis + RX_FIS_PIO_SETUP;
+		req_task->io_ports[7] = reply->control;
+	} else {
+		reply = dd->port->rxfis + RX_FIS_D2H_REG;
+		req_task->io_ports[7] = reply->command;
+	}
+
+	/* reclaim the DMA buffers.*/
+	if (inbuf_dma)
+		pci_unmap_single(dd->pdev, inbuf_dma,
+			taskin, DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		pci_unmap_single(dd->pdev, outbuf_dma,
+			taskout, DMA_TO_DEVICE);
+	inbuf_dma  = 0;
+	outbuf_dma = 0;
+
+	/* return the ATA registers to the caller.*/
+	req_task->io_ports[1] = reply->features;
+	req_task->io_ports[2] = reply->sect_count;
+	req_task->io_ports[3] = reply->lba_low;
+	req_task->io_ports[4] = reply->lba_mid;
+	req_task->io_ports[5] = reply->lba_hi;
+	req_task->io_ports[6] = reply->device;
+
+	if (req_task->out_flags.all & 1)  {
+
+		req_task->hob_ports[3] = reply->lba_low_ex;
+		req_task->hob_ports[4] = reply->lba_mid_ex;
+		req_task->hob_ports[5] = reply->lba_hi_ex;
+		req_task->hob_ports[1] = reply->features_ex;
+		req_task->hob_ports[2] = reply->sect_cnt_ex;
+	}
+
+	/* Com rest after secure erase or lowlevel format */
+	if (((fis.command == ATA_CMD_SEC_ERASE_UNIT) ||
+		((fis.command == 0xFC) &&
+			(fis.features == 0x27 || fis.features == 0x72 ||
+			 fis.features == 0x62 || fis.features == 0x26))) &&
+			 !(reply->command & 1)) {
+		mtip_restart_port(dd->port);
+	}
+
+	dbg_printk(MTIP_DRV_NAME
+		"%s: Completion: stat %x,"
+		"err %x, sect_cnt %x, lbalo %x,"
+		"lbamid %x, lbahi %x, dev %x\n",
+		__func__,
+		req_task->io_ports[7],
+		req_task->io_ports[1],
+		req_task->io_ports[2],
+		req_task->io_ports[3],
+		req_task->io_ports[4],
+		req_task->io_ports[5],
+		req_task->io_ports[6]);
+
+	if (taskout) {
+		if (copy_to_user(buf + outtotal, outbuf, taskout)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+	if (taskin) {
+		if (copy_to_user(buf + intotal, inbuf, taskin)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+abort:
+	if (inbuf_dma)
+		pci_unmap_single(dd->pdev, inbuf_dma,
+					taskin, DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		pci_unmap_single(dd->pdev, outbuf_dma,
+					taskout, DMA_TO_DEVICE);
+	kfree(outbuf);
+	kfree(inbuf);
+
+	return err;
+}
+
+/*
+ * Handle IOCTL calls from the Block Layer.
+ *
+ * This function is called by the Block Layer when it receives an IOCTL
+ * command that it does not understand. If the IOCTL command is not supported
+ * this function returns -ENOTTY.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @cmd IOCTL command passed from the Block Layer.
+ * @arg IOCTL argument passed from the Block Layer.
+ *
+ * return value
+ *	0	The IOCTL completed successfully.
+ *	-ENOTTY The specified command is not supported.
+ *	-EFAULT An error occurred copying data to a user space buffer.
+ *	-EIO	An error occurred while executing the command.
+ */
+static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
+			 unsigned long arg)
+{
+	switch (cmd) {
+	case HDIO_GET_IDENTITY:
+		if (mtip_get_identify(dd->port, (void __user *) arg) < 0) {
+			dev_warn(&dd->pdev->dev,
+				"Unable to read identity\n");
+			return -EIO;
+		}
+
+		break;
+	case HDIO_DRIVE_CMD:
+	{
+		u8 drive_command[4];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute the drive command. */
+		if (exec_drive_command(dd->port,
+					 drive_command,
+					 (void __user *) (arg+4)))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASK:
+	{
+		u8 drive_command[7];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute the drive command. */
+		if (exec_drive_task(dd->port, drive_command))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASKFILE: {
+		ide_task_request_t req_task;
+		int ret, outtotal;
+
+		if (copy_from_user(&req_task, (void __user *) arg,
+					sizeof(req_task)))
+			return -EFAULT;
+
+		outtotal = sizeof(req_task);
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+
+		if (copy_to_user((void __user *) arg, &req_task,
+							sizeof(req_task)))
+			return -EFAULT;
+
+		return ret;
+	}
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Submit an IO to the hw
+ *
+ * This function is called by the block layer to issue an io
+ * to the device. Upon completion, the callback function will
+ * be called with the data parameter passed as the callback data.
+ *
+ * @dd       Pointer to the driver data structure.
+ * @start    First sector to read.
+ * @nsect    Number of sectors to read.
+ * @nents    Number of entries in scatter list for the read command.
+ * @tag      The tag of this read command.
+ * @callback Pointer to the function that should be called
+ *	     when the read completes.
+ * @data     Callback data passed to the callback function
+ *	     when the read completes.
+ * @barrier  If non-zero, this command must be completed before
+ *	     issuing any other commands.
+ * @dir      Direction (read or write)
+ *
+ * return value
+ *	None
+ */
+static void mtip_hw_submit_io(struct driver_data *dd, sector_t start,
+			      int nsect, int nents, int tag, void *callback,
+			      void *data, int barrier, int dir)
+{
+	struct host_to_dev_fis	*fis;
+	struct mtip_port *port = dd->port;
+	struct mtip_cmd *command = &port->commands[tag];
+
+	/* Map the scatter list for DMA access */
+	if (dir == READ)
+		nents = dma_map_sg(&dd->pdev->dev, command->sg,
+					nents, DMA_FROM_DEVICE);
+	else
+		nents = dma_map_sg(&dd->pdev->dev, command->sg,
+					nents, DMA_TO_DEVICE);
+
+	command->scatter_ents = nents;
+
+	/*
+	 * The number of retries for this command before it is
+	 * reported as a failure to the upper layers.
+	 */
+	command->retries = MTIP_MAX_RETRIES;
+
+	/* Fill out fis */
+	fis = command->command;
+	fis->type        = 0x27;
+	fis->opts        = 1 << 7;
+	fis->command     =
+		(dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE);
+	*((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF);
+	*((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF);
+	fis->device	 = 1 << 6;
+	if (barrier)
+		fis->device |= FUA_BIT;
+	fis->features    = nsect & 0xFF;
+	fis->features_ex = (nsect >> 8) & 0xFF;
+	fis->sect_count  = ((tag << 3) | (tag >> 5));
+	fis->sect_cnt_ex = 0;
+	fis->control     = 0;
+	fis->res2        = 0;
+	fis->res3        = 0;
+	fill_command_sg(dd, command, nents);
+
+	/* Populate the command header */
+	command->command_header->opts =
+			__force_bit2int cpu_to_le32(
+				(nents << 16) | 5 | AHCI_CMD_PREFETCH);
+	command->command_header->byte_count = 0;
+
+	/*
+	 * Set the completion function and data for the command
+	 * within this layer.
+	 */
+	command->comp_data = dd;
+	command->comp_func = mtip_async_complete;
+	command->direction = (dir == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+
+	/*
+	 * Set the completion function and data for the command passed
+	 * from the upper layer.
+	 */
+	command->async_data = data;
+	command->async_callback = callback;
+
+	/*
+	 * To prevent this command from being issued
+	 * if an internal command is in progress or error handling is active.
+	 */
+	if (unlikely(test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) ||
+			test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags))) {
+		set_bit(tag, port->cmds_to_issue);
+		set_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags);
+		return;
+	}
+
+	/* Issue the command to the hardware */
+	mtip_issue_ncq_command(port, tag);
+
+	/* Set the command's timeout value.*/
+	port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
+					MTIP_NCQ_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Release a command slot.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @tag Slot tag
+ *
+ * return value
+ *      None
+ */
+static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
+{
+	release_slot(dd->port, tag);
+}
+
+/*
+ * Obtain a command slot and return its associated scatter list.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @tag Pointer to an int that will receive the allocated command
+ *            slot tag.
+ *
+ * return value
+ *	Pointer to the scatter list for the allocated command slot
+ *	or NULL if no command slots are available.
+ */
+static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
+						   int *tag)
+{
+	/*
+	 * It is possible that, even with this semaphore, a thread
+	 * may think that no command slots are available. Therefore, we
+	 * need to make an attempt to get_slot().
+	 */
+	down(&dd->port->cmd_slot);
+	*tag = get_slot(dd->port);
+
+	if (unlikely(*tag < 0))
+		return NULL;
+
+	return dd->port->commands[*tag].sg;
+}
+
+/*
+ * Sysfs register/status dump.
+ *
+ * @dev  Pointer to the device structure, passed by the kernrel.
+ * @attr Pointer to the device_attribute structure passed by the kernel.
+ * @buf  Pointer to the char buffer that will receive the stats info.
+ *
+ * return value
+ *	The size, in bytes, of the data copied into buf.
+ */
+static ssize_t hw_show_registers(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	u32 group_allocated;
+	struct driver_data *dd = dev_to_disk(dev)->private_data;
+	int size = 0;
+	int n;
+
+	size += sprintf(&buf[size], "%s:\ns_active:\n", __func__);
+
+	for (n = 0; n < dd->slot_groups; n++)
+		size += sprintf(&buf[size], "0x%08x\n",
+					 readl(dd->port->s_active[n]));
+
+	size += sprintf(&buf[size], "Command Issue:\n");
+
+	for (n = 0; n < dd->slot_groups; n++)
+		size += sprintf(&buf[size], "0x%08x\n",
+					readl(dd->port->cmd_issue[n]));
+
+	size += sprintf(&buf[size], "Allocated:\n");
+
+	for (n = 0; n < dd->slot_groups; n++) {
+		if (sizeof(long) > sizeof(u32))
+			group_allocated =
+				dd->port->allocated[n/2] >> (32*(n&1));
+		else
+			group_allocated = dd->port->allocated[n];
+		size += sprintf(&buf[size], "0x%08x\n",
+				 group_allocated);
+	}
+
+	size += sprintf(&buf[size], "completed:\n");
+
+	for (n = 0; n < dd->slot_groups; n++)
+		size += sprintf(&buf[size], "0x%08x\n",
+				readl(dd->port->completed[n]));
+
+	size += sprintf(&buf[size], "PORT_IRQ_STAT 0x%08x\n",
+				readl(dd->port->mmio + PORT_IRQ_STAT));
+	size += sprintf(&buf[size], "HOST_IRQ_STAT 0x%08x\n",
+				readl(dd->mmio + HOST_IRQ_STAT));
+
+	return size;
+}
+static DEVICE_ATTR(registers, S_IRUGO, hw_show_registers, NULL);
+
+/*
+ * Create the sysfs related attributes.
+ *
+ * @dd   Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ *	0	Operation completed successfully.
+ *	-EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj)
+{
+	if (!kobj || !dd)
+		return -EINVAL;
+
+	if (sysfs_create_file(kobj, &dev_attr_registers.attr))
+		dev_warn(&dd->pdev->dev,
+			"Error creating registers sysfs entry\n");
+	return 0;
+}
+
+/*
+ * Remove the sysfs related attributes.
+ *
+ * @dd   Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ *	0	Operation completed successfully.
+ *	-EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj)
+{
+	if (!kobj || !dd)
+		return -EINVAL;
+
+	sysfs_remove_file(kobj, &dev_attr_registers.attr);
+
+	return 0;
+}
+
+/*
+ * Perform any init/resume time hardware setup
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static inline void hba_setup(struct driver_data *dd)
+{
+	u32 hwdata;
+	hwdata = readl(dd->mmio + HOST_HSORG);
+
+	/* interrupt bug workaround: use only 1 IS bit.*/
+	writel(hwdata |
+		HSORG_DISABLE_SLOTGRP_INTR |
+		HSORG_DISABLE_SLOTGRP_PXIS,
+		dd->mmio + HOST_HSORG);
+}
+
+/*
+ * Detect the details of the product, and store anything needed
+ * into the driver data structure.  This includes product type and
+ * version and number of slot groups.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_detect_product(struct driver_data *dd)
+{
+	u32 hwdata;
+	unsigned int rev, slotgroups;
+
+	/*
+	 * HBA base + 0xFC [15:0] - vendor-specific hardware interface
+	 * info register:
+	 * [15:8] hardware/software interface rev#
+	 * [   3] asic-style interface
+	 * [ 2:0] number of slot groups, minus 1 (only valid for asic-style).
+	 */
+	hwdata = readl(dd->mmio + HOST_HSORG);
+
+	dd->product_type = MTIP_PRODUCT_UNKNOWN;
+	dd->slot_groups = 1;
+
+	if (hwdata & 0x8) {
+		dd->product_type = MTIP_PRODUCT_ASICFPGA;
+		rev = (hwdata & HSORG_HWREV) >> 8;
+		slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1;
+		dev_info(&dd->pdev->dev,
+			"ASIC-FPGA design, HS rev 0x%x, "
+			"%i slot groups [%i slots]\n",
+			 rev,
+			 slotgroups,
+			 slotgroups * 32);
+
+		if (slotgroups > MTIP_MAX_SLOT_GROUPS) {
+			dev_warn(&dd->pdev->dev,
+				"Warning: driver only supports "
+				"%i slot groups.\n", MTIP_MAX_SLOT_GROUPS);
+			slotgroups = MTIP_MAX_SLOT_GROUPS;
+		}
+		dd->slot_groups = slotgroups;
+		return;
+	}
+
+	dev_warn(&dd->pdev->dev, "Unrecognized product id\n");
+}
+
+/*
+ * Blocking wait for FTL rebuild to complete
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	0	FTL rebuild completed successfully
+ *	-EFAULT FTL rebuild error/timeout/interruption
+ */
+static int mtip_ftl_rebuild_poll(struct driver_data *dd)
+{
+	unsigned long timeout, cnt = 0, start;
+
+	dev_warn(&dd->pdev->dev,
+		"FTL rebuild in progress. Polling for completion.\n");
+
+	start = jiffies;
+	dd->ftlrebuildflag = 1;
+	timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
+
+	do {
+		if (mtip_check_surprise_removal(dd->pdev))
+			return -EFAULT;
+
+		if (mtip_get_identify(dd->port, NULL) < 0)
+			return -EFAULT;
+
+		if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+			MTIP_FTL_REBUILD_MAGIC) {
+			ssleep(1);
+			/* Print message every 3 minutes */
+			if (cnt++ >= 180) {
+				dev_warn(&dd->pdev->dev,
+				"FTL rebuild in progress (%d secs).\n",
+				jiffies_to_msecs(jiffies - start) / 1000);
+				cnt = 0;
+			}
+		} else {
+			dev_warn(&dd->pdev->dev,
+				"FTL rebuild complete (%d secs).\n",
+			jiffies_to_msecs(jiffies - start) / 1000);
+			dd->ftlrebuildflag = 0;
+			mtip_block_initialize(dd);
+			break;
+		}
+		ssleep(10);
+	} while (time_before(jiffies, timeout));
+
+	/* Check for timeout */
+	if (dd->ftlrebuildflag) {
+		dev_err(&dd->pdev->dev,
+		"Timed out waiting for FTL rebuild to complete (%d secs).\n",
+		jiffies_to_msecs(jiffies - start) / 1000);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * service thread to issue queued commands
+ *
+ * @data Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+
+static int mtip_service_thread(void *data)
+{
+	struct driver_data *dd = (struct driver_data *)data;
+	unsigned long slot, slot_start, slot_wrap;
+	unsigned int num_cmd_slots = dd->slot_groups * 32;
+	struct mtip_port *port = dd->port;
+
+	while (1) {
+		/*
+		 * the condition is to check neither an internal command is
+		 * is in progress nor error handling is active
+		 */
+		wait_event_interruptible(port->svc_wait, (port->flags) &&
+			!test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) &&
+			!test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags));
+
+		if (kthread_should_stop())
+			break;
+
+		set_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags);
+		if (test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) {
+			slot = 1;
+			/* used to restrict the loop to one iteration */
+			slot_start = num_cmd_slots;
+			slot_wrap = 0;
+			while (1) {
+				slot = find_next_bit(port->cmds_to_issue,
+						num_cmd_slots, slot);
+				if (slot_wrap == 1) {
+					if ((slot_start >= slot) ||
+						(slot >= num_cmd_slots))
+						break;
+				}
+				if (unlikely(slot_start == num_cmd_slots))
+					slot_start = slot;
+
+				if (unlikely(slot == num_cmd_slots)) {
+					slot = 1;
+					slot_wrap = 1;
+					continue;
+				}
+
+				/* Issue the command to the hardware */
+				mtip_issue_ncq_command(port, slot);
+
+				/* Set the command's timeout value.*/
+				port->commands[slot].comp_time = jiffies +
+				msecs_to_jiffies(MTIP_NCQ_COMMAND_TIMEOUT_MS);
+
+				clear_bit(slot, port->cmds_to_issue);
+			}
+
+			clear_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags);
+		} else if (test_bit(MTIP_FLAG_REBUILD_BIT, &port->flags)) {
+			mtip_ftl_rebuild_poll(dd);
+			clear_bit(MTIP_FLAG_REBUILD_BIT, &port->flags);
+		}
+		clear_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags);
+
+		if (test_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &port->flags))
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Called once for each card.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success, else an error code.
+ */
+static int mtip_hw_init(struct driver_data *dd)
+{
+	int i;
+	int rv;
+	unsigned int num_command_slots;
+
+	dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+
+	mtip_detect_product(dd);
+	if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
+		rv = -EIO;
+		goto out1;
+	}
+	num_command_slots = dd->slot_groups * 32;
+
+	hba_setup(dd);
+
+	tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd);
+
+	dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL);
+	if (!dd->port) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: port structure\n");
+		return -ENOMEM;
+	}
+
+	/* Counting semaphore to track command slot usage */
+	sema_init(&dd->port->cmd_slot, num_command_slots - 1);
+
+	/* Spinlock to prevent concurrent issue */
+	spin_lock_init(&dd->port->cmd_issue_lock);
+
+	/* Set the port mmio base address. */
+	dd->port->mmio	= dd->mmio + PORT_OFFSET;
+	dd->port->dd	= dd;
+
+	/* Allocate memory for the command list. */
+	dd->port->command_list =
+		dmam_alloc_coherent(&dd->pdev->dev,
+			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2),
+			&dd->port->command_list_dma,
+			GFP_KERNEL);
+	if (!dd->port->command_list) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: command list\n");
+		rv = -ENOMEM;
+		goto out1;
+	}
+
+	/* Clear the memory we have allocated. */
+	memset(dd->port->command_list,
+		0,
+		HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2));
+
+	/* Setup the addresse of the RX FIS. */
+	dd->port->rxfis	    = dd->port->command_list + HW_CMD_SLOT_SZ;
+	dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ;
+
+	/* Setup the address of the command tables. */
+	dd->port->command_table	  = dd->port->rxfis + AHCI_RX_FIS_SZ;
+	dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ;
+
+	/* Setup the address of the identify data. */
+	dd->port->identify     = dd->port->command_table +
+					HW_CMD_TBL_AR_SZ;
+	dd->port->identify_dma = dd->port->command_tbl_dma +
+					HW_CMD_TBL_AR_SZ;
+
+	/* Setup the address of the sector buffer. */
+	dd->port->sector_buffer	= (void *) dd->port->identify + ATA_SECT_SIZE;
+	dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
+
+	/* Point the command headers at the command tables. */
+	for (i = 0; i < num_command_slots; i++) {
+		dd->port->commands[i].command_header =
+					dd->port->command_list +
+					(sizeof(struct mtip_cmd_hdr) * i);
+		dd->port->commands[i].command_header_dma =
+					dd->port->command_list_dma +
+					(sizeof(struct mtip_cmd_hdr) * i);
+
+		dd->port->commands[i].command =
+			dd->port->command_table + (HW_CMD_TBL_SZ * i);
+		dd->port->commands[i].command_dma =
+			dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i);
+
+		if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64)
+			dd->port->commands[i].command_header->ctbau =
+			__force_bit2int cpu_to_le32(
+			(dd->port->commands[i].command_dma >> 16) >> 16);
+		dd->port->commands[i].command_header->ctba =
+			__force_bit2int cpu_to_le32(
+			dd->port->commands[i].command_dma & 0xFFFFFFFF);
+
+		/*
+		 * If this is not done, a bug is reported by the stock
+		 * FC11 i386. Due to the fact that it has lots of kernel
+		 * debugging enabled.
+		 */
+		sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG);
+
+		/* Mark all commands as currently inactive.*/
+		atomic_set(&dd->port->commands[i].active, 0);
+	}
+
+	/* Setup the pointers to the extended s_active and CI registers. */
+	for (i = 0; i < dd->slot_groups; i++) {
+		dd->port->s_active[i] =
+			dd->port->mmio + i*0x80 + PORT_SCR_ACT;
+		dd->port->cmd_issue[i] =
+			dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE;
+		dd->port->completed[i] =
+			dd->port->mmio + i*0x80 + PORT_SDBV;
+	}
+
+	/* Reset the HBA. */
+	if (mtip_hba_reset(dd) < 0) {
+		dev_err(&dd->pdev->dev,
+			"Card did not reset within timeout\n");
+		rv = -EIO;
+		goto out2;
+	}
+
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Setup the ISR and enable interrupts. */
+	rv = devm_request_irq(&dd->pdev->dev,
+				dd->pdev->irq,
+				mtip_irq_handler,
+				IRQF_SHARED,
+				dev_driver_string(&dd->pdev->dev),
+				dd);
+
+	if (rv) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate IRQ %d\n", dd->pdev->irq);
+		goto out2;
+	}
+
+	/* Enable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+					dd->mmio + HOST_CTL);
+
+	init_timer(&dd->port->cmd_timer);
+	init_waitqueue_head(&dd->port->svc_wait);
+
+	dd->port->cmd_timer.data = (unsigned long int) dd->port;
+	dd->port->cmd_timer.function = mtip_timeout_function;
+	mod_timer(&dd->port->cmd_timer,
+		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+
+	if (mtip_get_identify(dd->port, NULL) < 0) {
+		rv = -EFAULT;
+		goto out3;
+	}
+
+	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+		MTIP_FTL_REBUILD_MAGIC) {
+		set_bit(MTIP_FLAG_REBUILD_BIT, &dd->port->flags);
+		return MTIP_FTL_REBUILD_MAGIC;
+	}
+	mtip_dump_identify(dd->port);
+	return rv;
+
+out3:
+	del_timer_sync(&dd->port->cmd_timer);
+
+	/* Disable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+
+	/*Release the IRQ. */
+	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+out2:
+	mtip_deinit_port(dd->port);
+
+	/* Free the command/command header memory. */
+	dmam_free_coherent(&dd->pdev->dev,
+				HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2),
+				dd->port->command_list,
+				dd->port->command_list_dma);
+out1:
+	/* Free the memory allocated for the for structure. */
+	kfree(dd->port);
+
+	return rv;
+}
+
+/*
+ * Called to deinitialize an interface.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_hw_exit(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive so that it
+	 * saves its state.
+	 */
+	if (atomic_read(&dd->drv_cleanup_done) != true) {
+
+		mtip_standby_immediate(dd->port);
+
+		/* de-initialize the port. */
+		mtip_deinit_port(dd->port);
+
+		/* Disable interrupts on the HBA. */
+		writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+				dd->mmio + HOST_CTL);
+	}
+
+	del_timer_sync(&dd->port->cmd_timer);
+
+	/* Release the IRQ. */
+	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+	/* Stop the bottom half tasklet. */
+	tasklet_kill(&dd->tasklet);
+
+	/* Free the command/command header memory. */
+	dmam_free_coherent(&dd->pdev->dev,
+			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2),
+			dd->port->command_list,
+			dd->port->command_list_dma);
+	/* Free the memory allocated for the for structure. */
+	kfree(dd->port);
+
+	return 0;
+}
+
+/*
+ * Issue a Standby Immediate command to the device.
+ *
+ * This function is called by the Block Layer just before the
+ * system powers off during a shutdown.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_hw_shutdown(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive so that it
+	 * saves its state.
+	 */
+	mtip_standby_immediate(dd->port);
+
+	return 0;
+}
+
+/*
+ * Suspend function
+ *
+ * This function is called by the Block Layer just before the
+ * system hibernates.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Suspend was successful
+ *	-EFAULT Suspend was not successful
+ */
+static int mtip_hw_suspend(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive
+	 * so that it saves its state.
+	 */
+	if (mtip_standby_immediate(dd->port) != 0) {
+		dev_err(&dd->pdev->dev,
+			"Failed standby-immediate command\n");
+		return -EFAULT;
+	}
+
+	/* Disable interrupts on the HBA.*/
+	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+	mtip_deinit_port(dd->port);
+
+	return 0;
+}
+
+/*
+ * Resume function
+ *
+ * This function is called by the Block Layer as the
+ * system resumes.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Resume was successful
+ *      -EFAULT Resume was not successful
+ */
+static int mtip_hw_resume(struct driver_data *dd)
+{
+	/* Perform any needed hardware setup steps */
+	hba_setup(dd);
+
+	/* Reset the HBA */
+	if (mtip_hba_reset(dd) != 0) {
+		dev_err(&dd->pdev->dev,
+			"Unable to reset the HBA\n");
+		return -EFAULT;
+	}
+
+	/*
+	 * Enable the port, DMA engine, and FIS reception specific
+	 * h/w in controller.
+	 */
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Enable interrupts on the HBA.*/
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+
+	return 0;
+}
+
+/*
+ * Helper function for reusing disk name
+ * upon hot insertion.
+ */
+static int rssd_disk_name_format(char *prefix,
+				 int index,
+				 char *buf,
+				 int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
+/*
+ * Block layer IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	0        IOCTL completed successfully.
+ *	-ENOTTY  IOCTL not supported or invalid driver data
+ *                 structure pointer.
+ */
+static int mtip_block_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!dd)
+		return -ENOTTY;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return -ENOTTY;
+	default:
+		return mtip_hw_ioctl(dd, cmd, arg);
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Block layer compat IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	0        IOCTL completed successfully.
+ *	-ENOTTY  IOCTL not supported or invalid driver data
+ *                 structure pointer.
+ */
+static int mtip_block_compat_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!dd)
+		return -ENOTTY;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return -ENOTTY;
+	case HDIO_DRIVE_TASKFILE: {
+		struct mtip_compat_ide_task_request_s __user *compat_req_task;
+		ide_task_request_t req_task;
+		int compat_tasksize, outtotal, ret;
+
+		compat_tasksize =
+			sizeof(struct mtip_compat_ide_task_request_s);
+
+		compat_req_task =
+			(struct mtip_compat_ide_task_request_s __user *) arg;
+
+		if (copy_from_user(&req_task, (void __user *) arg,
+			compat_tasksize - (2 * sizeof(compat_long_t))))
+			return -EFAULT;
+
+		if (get_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (get_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		outtotal = sizeof(struct mtip_compat_ide_task_request_s);
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+
+		if (copy_to_user((void __user *) arg, &req_task,
+				compat_tasksize -
+				(2 * sizeof(compat_long_t))))
+			return -EFAULT;
+
+		if (put_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (put_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		return ret;
+	}
+	default:
+		return mtip_hw_ioctl(dd, cmd, arg);
+	}
+}
+#endif
+
+/*
+ * Obtain the geometry of the device.
+ *
+ * You may think that this function is obsolete, but some applications,
+ * fdisk for example still used CHS values. This function describes the
+ * device as having 224 heads and 56 sectors per cylinder. These values are
+ * chosen so that each cylinder is aligned on a 4KB boundary. Since a
+ * partition is described in terms of a start and end cylinder this means
+ * that each partition is also 4KB aligned. Non-aligned partitions adversely
+ * affects performance.
+ *
+ * @dev Pointer to the block_device strucutre.
+ * @geo Pointer to a hd_geometry structure.
+ *
+ * return value
+ *	0       Operation completed successfully.
+ *	-ENOTTY An error occurred while reading the drive capacity.
+ */
+static int mtip_block_getgeo(struct block_device *dev,
+				struct hd_geometry *geo)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+	sector_t capacity;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (!(mtip_hw_get_capacity(dd, &capacity))) {
+		dev_warn(&dd->pdev->dev,
+			"Could not get drive capacity.\n");
+		return -ENOTTY;
+	}
+
+	geo->heads = 224;
+	geo->sectors = 56;
+	sector_div(capacity, (geo->heads * geo->sectors));
+	geo->cylinders = capacity;
+	return 0;
+}
+
+/*
+ * Block device operation function.
+ *
+ * This structure contains pointers to the functions required by the block
+ * layer.
+ */
+static const struct block_device_operations mtip_block_ops = {
+	.ioctl		= mtip_block_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= mtip_block_compat_ioctl,
+#endif
+	.getgeo		= mtip_block_getgeo,
+	.owner		= THIS_MODULE
+};
+
+/*
+ * Block layer make request function.
+ *
+ * This function is called by the kernel to process a BIO for
+ * the P320 device.
+ *
+ * @queue Pointer to the request queue. Unused other than to obtain
+ *              the driver data structure.
+ * @bio   Pointer to the BIO.
+ *
+ */
+static void mtip_make_request(struct request_queue *queue, struct bio *bio)
+{
+	struct driver_data *dd = queue->queuedata;
+	struct scatterlist *sg;
+	struct bio_vec *bvec;
+	int nents = 0;
+	int tag = 0;
+
+	if (unlikely(!bio_has_data(bio))) {
+		blk_queue_flush(queue, 0);
+		bio_endio(bio, 0);
+		return;
+	}
+
+	sg = mtip_hw_get_scatterlist(dd, &tag);
+	if (likely(sg != NULL)) {
+		blk_queue_bounce(queue, &bio);
+
+		if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) {
+			dev_warn(&dd->pdev->dev,
+				"Maximum number of SGL entries exceeded");
+			bio_io_error(bio);
+			mtip_hw_release_scatterlist(dd, tag);
+			return;
+		}
+
+		/* Create the scatter list for this bio. */
+		bio_for_each_segment(bvec, bio, nents) {
+			sg_set_page(&sg[nents],
+					bvec->bv_page,
+					bvec->bv_len,
+					bvec->bv_offset);
+		}
+
+		/* Issue the read/write. */
+		mtip_hw_submit_io(dd,
+				bio->bi_sector,
+				bio_sectors(bio),
+				nents,
+				tag,
+				bio_endio,
+				bio,
+				bio->bi_rw & REQ_FUA,
+				bio_data_dir(bio));
+	} else
+		bio_io_error(bio);
+}
+
+/*
+ * Block layer initialization function.
+ *
+ * This function is called once by the PCI layer for each P320
+ * device that is connected to the system.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success else an error code.
+ */
+static int mtip_block_initialize(struct driver_data *dd)
+{
+	int rv = 0, wait_for_rebuild = 0;
+	sector_t capacity;
+	unsigned int index = 0;
+	struct kobject *kobj;
+	unsigned char thd_name[16];
+
+	if (dd->disk)
+		goto skip_create_disk; /* hw init done, before rebuild */
+
+	/* Initialize the protocol layer. */
+	wait_for_rebuild = mtip_hw_init(dd);
+	if (wait_for_rebuild < 0) {
+		dev_err(&dd->pdev->dev,
+			"Protocol layer initialization failed\n");
+		rv = -EINVAL;
+		goto protocol_init_error;
+	}
+
+	dd->disk = alloc_disk(MTIP_MAX_MINORS);
+	if (dd->disk  == NULL) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate gendisk structure\n");
+		rv = -EINVAL;
+		goto alloc_disk_error;
+	}
+
+	/* Generate the disk name, implemented same as in sd.c */
+	do {
+		if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL))
+			goto ida_get_error;
+
+		spin_lock(&rssd_index_lock);
+		rv = ida_get_new(&rssd_index_ida, &index);
+		spin_unlock(&rssd_index_lock);
+	} while (rv == -EAGAIN);
+
+	if (rv)
+		goto ida_get_error;
+
+	rv = rssd_disk_name_format("rssd",
+				index,
+				dd->disk->disk_name,
+				DISK_NAME_LEN);
+	if (rv)
+		goto disk_index_error;
+
+	dd->disk->driverfs_dev	= &dd->pdev->dev;
+	dd->disk->major		= dd->major;
+	dd->disk->first_minor	= dd->instance * MTIP_MAX_MINORS;
+	dd->disk->fops		= &mtip_block_ops;
+	dd->disk->private_data	= dd;
+	dd->index		= index;
+
+	/*
+	 * if rebuild pending, start the service thread, and delay the block
+	 * queue creation and add_disk()
+	 */
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		goto start_service_thread;
+
+skip_create_disk:
+	/* Allocate the request queue. */
+	dd->queue = blk_alloc_queue(GFP_KERNEL);
+	if (dd->queue == NULL) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate request queue\n");
+		rv = -ENOMEM;
+		goto block_queue_alloc_init_error;
+	}
+
+	/* Attach our request function to the request queue. */
+	blk_queue_make_request(dd->queue, mtip_make_request);
+
+	dd->disk->queue		= dd->queue;
+	dd->queue->queuedata	= dd;
+
+	/* Set device limits. */
+	set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
+	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
+	blk_queue_physical_block_size(dd->queue, 4096);
+	blk_queue_io_min(dd->queue, 4096);
+	blk_queue_flush(dd->queue, 0);
+
+	/* Set the capacity of the device in 512 byte sectors. */
+	if (!(mtip_hw_get_capacity(dd, &capacity))) {
+		dev_warn(&dd->pdev->dev,
+			"Could not read drive capacity\n");
+		rv = -EIO;
+		goto read_capacity_error;
+	}
+	set_capacity(dd->disk, capacity);
+
+	/* Enable the block device and add it to /dev */
+	add_disk(dd->disk);
+
+	/*
+	 * Now that the disk is active, initialize any sysfs attributes
+	 * managed by the protocol layer.
+	 */
+	kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+	if (kobj) {
+		mtip_hw_sysfs_init(dd, kobj);
+		kobject_put(kobj);
+	}
+
+	if (dd->mtip_svc_handler)
+		return rv; /* service thread created for handling rebuild */
+
+start_service_thread:
+	sprintf(thd_name, "mtip_svc_thd_%02d", index);
+
+	dd->mtip_svc_handler = kthread_run(mtip_service_thread,
+						dd, thd_name);
+
+	if (IS_ERR(dd->mtip_svc_handler)) {
+		printk(KERN_ERR "mtip32xx: service thread failed to start\n");
+		dd->mtip_svc_handler = NULL;
+		rv = -EFAULT;
+		goto kthread_run_error;
+	}
+
+	return rv;
+
+kthread_run_error:
+	/* Delete our gendisk. This also removes the device from /dev */
+	del_gendisk(dd->disk);
+
+read_capacity_error:
+	blk_cleanup_queue(dd->queue);
+
+block_queue_alloc_init_error:
+disk_index_error:
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, index);
+	spin_unlock(&rssd_index_lock);
+
+ida_get_error:
+	put_disk(dd->disk);
+
+alloc_disk_error:
+	mtip_hw_exit(dd); /* De-initialize the protocol layer. */
+
+protocol_init_error:
+	return rv;
+}
+
+/*
+ * Block layer deinitialization function.
+ *
+ * Called by the PCI layer as each P320 device is removed.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_block_remove(struct driver_data *dd)
+{
+	struct kobject *kobj;
+
+	if (dd->mtip_svc_handler) {
+		set_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &dd->port->flags);
+		wake_up_interruptible(&dd->port->svc_wait);
+		kthread_stop(dd->mtip_svc_handler);
+	}
+
+	/* Clean up the sysfs attributes managed by the protocol layer. */
+	kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+	if (kobj) {
+		mtip_hw_sysfs_exit(dd, kobj);
+		kobject_put(kobj);
+	}
+
+	/*
+	 * Delete our gendisk structure. This also removes the device
+	 * from /dev
+	 */
+	del_gendisk(dd->disk);
+	blk_cleanup_queue(dd->queue);
+	dd->disk  = NULL;
+	dd->queue = NULL;
+
+	/* De-initialize the protocol layer. */
+	mtip_hw_exit(dd);
+
+	return 0;
+}
+
+/*
+ * Function called by the PCI layer when just before the
+ * machine shuts down.
+ *
+ * If a protocol layer shutdown function is present it will be called
+ * by this function.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_block_shutdown(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev,
+		"Shutting down %s ...\n", dd->disk->disk_name);
+
+	/* Delete our gendisk structure, and cleanup the blk queue. */
+	del_gendisk(dd->disk);
+	blk_cleanup_queue(dd->queue);
+	dd->disk  = NULL;
+	dd->queue = NULL;
+
+	mtip_hw_shutdown(dd);
+	return 0;
+}
+
+static int mtip_block_suspend(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev,
+		"Suspending %s ...\n", dd->disk->disk_name);
+	mtip_hw_suspend(dd);
+	return 0;
+}
+
+static int mtip_block_resume(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev, "Resuming %s ...\n",
+		dd->disk->disk_name);
+	mtip_hw_resume(dd);
+	return 0;
+}
+
+/*
+ * Called for each supported PCI device detected.
+ *
+ * This function allocates the private data structure, enables the
+ * PCI device and then calls the block layer initialization function.
+ *
+ * return value
+ *	0 on success else an error code.
+ */
+static int mtip_pci_probe(struct pci_dev *pdev,
+			const struct pci_device_id *ent)
+{
+	int rv = 0;
+	struct driver_data *dd = NULL;
+
+	/* Allocate memory for this devices private data. */
+	dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL);
+	if (dd == NULL) {
+		dev_err(&pdev->dev,
+			"Unable to allocate memory for driver data\n");
+		return -ENOMEM;
+	}
+
+	/* Set the atomic variable as 1 in case of SRSI */
+	atomic_set(&dd->drv_cleanup_done, true);
+
+	atomic_set(&dd->resumeflag, false);
+
+	/* Attach the private data to this PCI device.  */
+	pci_set_drvdata(pdev, dd);
+
+	rv = pcim_enable_device(pdev);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to enable device\n");
+		goto iomap_err;
+	}
+
+	/* Map BAR5 to memory. */
+	rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to map regions\n");
+		goto iomap_err;
+	}
+
+	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+		rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+
+		if (rv) {
+			rv = pci_set_consistent_dma_mask(pdev,
+						DMA_BIT_MASK(32));
+			if (rv) {
+				dev_warn(&pdev->dev,
+					"64-bit DMA enable failed\n");
+				goto setmask_err;
+			}
+		}
+	}
+
+	pci_set_master(pdev);
+
+	if (pci_enable_msi(pdev)) {
+		dev_warn(&pdev->dev,
+			"Unable to enable MSI interrupt.\n");
+		goto block_initialize_err;
+	}
+
+	/* Copy the info we may need later into the private data structure. */
+	dd->major	= mtip_major;
+	dd->instance	= instance;
+	dd->pdev	= pdev;
+
+	/* Initialize the block layer. */
+	rv = mtip_block_initialize(dd);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Unable to initialize block layer\n");
+		goto block_initialize_err;
+	}
+
+	/*
+	 * Increment the instance count so that each device has a unique
+	 * instance number.
+	 */
+	instance++;
+
+	goto done;
+
+block_initialize_err:
+	pci_disable_msi(pdev);
+
+setmask_err:
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+
+iomap_err:
+	kfree(dd);
+	pci_set_drvdata(pdev, NULL);
+	return rv;
+done:
+	/* Set the atomic variable as 0 in case of SRSI */
+	atomic_set(&dd->drv_cleanup_done, true);
+
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is removed or the
+ * driver is unloaded.
+ *
+ * return value
+ *	None
+ */
+static void mtip_pci_remove(struct pci_dev *pdev)
+{
+	struct driver_data *dd = pci_get_drvdata(pdev);
+	int counter = 0;
+
+	if (mtip_check_surprise_removal(pdev)) {
+		while (atomic_read(&dd->drv_cleanup_done) == false) {
+			counter++;
+			msleep(20);
+			if (counter == 10) {
+				/* Cleanup the outstanding commands */
+				mtip_command_cleanup(dd);
+				break;
+			}
+		}
+	}
+	/* Set the atomic variable as 1 in case of SRSI */
+	atomic_set(&dd->drv_cleanup_done, true);
+
+	/* Clean up the block layer. */
+	mtip_block_remove(dd);
+
+	pci_disable_msi(pdev);
+
+	kfree(dd);
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+}
+
+/*
+ * Called for each probed device when the device is suspended.
+ *
+ * return value
+ *	0  Success
+ *	<0 Error
+ */
+static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
+{
+	int rv = 0;
+	struct driver_data *dd = pci_get_drvdata(pdev);
+
+	if (!dd) {
+		dev_err(&pdev->dev,
+			"Driver private datastructure is NULL\n");
+		return -EFAULT;
+	}
+
+	atomic_set(&dd->resumeflag, true);
+
+	/* Disable ports & interrupts then send standby immediate */
+	rv = mtip_block_suspend(dd);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Failed to suspend controller\n");
+		return rv;
+	}
+
+	/*
+	 * Save the pci config space to pdev structure &
+	 * disable the device
+	 */
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+
+	/* Move to Low power state*/
+	pci_set_power_state(pdev, PCI_D3hot);
+
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is resumed.
+ *
+ * return value
+ *      0  Success
+ *      <0 Error
+ */
+static int mtip_pci_resume(struct pci_dev *pdev)
+{
+	int rv = 0;
+	struct driver_data *dd;
+
+	dd = pci_get_drvdata(pdev);
+	if (!dd) {
+		dev_err(&pdev->dev,
+			"Driver private datastructure is NULL\n");
+		return -EFAULT;
+	}
+
+	/* Move the device to active State */
+	pci_set_power_state(pdev, PCI_D0);
+
+	/* Restore PCI configuration space */
+	pci_restore_state(pdev);
+
+	/* Enable the PCI device*/
+	rv = pcim_enable_device(pdev);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Failed to enable card during resume\n");
+		goto err;
+	}
+	pci_set_master(pdev);
+
+	/*
+	 * Calls hbaReset, initPort, & startPort function
+	 * then enables interrupts
+	 */
+	rv = mtip_block_resume(dd);
+	if (rv < 0)
+		dev_err(&pdev->dev, "Unable to resume\n");
+
+err:
+	atomic_set(&dd->resumeflag, false);
+
+	return rv;
+}
+
+/*
+ * Shutdown routine
+ *
+ * return value
+ *      None
+ */
+static void mtip_pci_shutdown(struct pci_dev *pdev)
+{
+	struct driver_data *dd = pci_get_drvdata(pdev);
+	if (dd)
+		mtip_block_shutdown(dd);
+}
+
+/* Table of device ids supported by this driver. */
+static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = {
+	{  PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320_DEVICE_ID) },
+	{ 0 }
+};
+
+/* Structure that describes the PCI driver functions. */
+static struct pci_driver mtip_pci_driver = {
+	.name			= MTIP_DRV_NAME,
+	.id_table		= mtip_pci_tbl,
+	.probe			= mtip_pci_probe,
+	.remove			= mtip_pci_remove,
+	.suspend		= mtip_pci_suspend,
+	.resume			= mtip_pci_resume,
+	.shutdown		= mtip_pci_shutdown,
+};
+
+MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
+
+/*
+ * Module initialization function.
+ *
+ * Called once when the module is loaded. This function allocates a major
+ * block device number to the Cyclone devices and registers the PCI layer
+ * of the driver.
+ *
+ * Return value
+ *      0 on success else error code.
+ */
+static int __init mtip_init(void)
+{
+	printk(KERN_INFO MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
+
+	/* Allocate a major block device number to use with this driver. */
+	mtip_major = register_blkdev(0, MTIP_DRV_NAME);
+	if (mtip_major < 0) {
+		printk(KERN_ERR "Unable to register block device (%d)\n",
+		mtip_major);
+		return -EBUSY;
+	}
+
+	/* Register our PCI operations. */
+	return pci_register_driver(&mtip_pci_driver);
+}
+
+/*
+ * Module de-initialization function.
+ *
+ * Called once when the module is unloaded. This function deallocates
+ * the major block device number allocated by mtip_init() and
+ * unregisters the PCI layer of the driver.
+ *
+ * Return value
+ *      none
+ */
+static void __exit mtip_exit(void)
+{
+	/* Release the allocated major block device number. */
+	unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+	/* Unregister the PCI driver. */
+	pci_unregister_driver(&mtip_pci_driver);
+}
+
+MODULE_AUTHOR("Micron Technology, Inc");
+MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MTIP_DRV_VERSION);
+
+module_init(mtip_init);
+module_exit(mtip_exit);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
new file mode 100644
index 000000000000..723d7c4946dc
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -0,0 +1,423 @@
+/*
+ * mtip32xx.h - Header file for the P320 SSD Block Driver
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __MTIP32XX_H__
+#define __MTIP32XX_H__
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+#include <linux/genhd.h>
+#include <linux/version.h>
+
+/* Offset of Subsystem Device ID in pci confoguration space */
+#define PCI_SUBSYSTEM_DEVICEID	0x2E
+
+/* offset of Device Control register in PCIe extended capabilites space */
+#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET	0x48
+
+/* # of times to retry timed out IOs */
+#define MTIP_MAX_RETRIES	5
+
+/* Various timeout values in ms */
+#define MTIP_NCQ_COMMAND_TIMEOUT_MS       5000
+#define MTIP_IOCTL_COMMAND_TIMEOUT_MS     5000
+#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS  5000
+
+/* check for timeouts every 500ms */
+#define MTIP_TIMEOUT_CHECK_PERIOD	500
+
+/* ftl rebuild */
+#define MTIP_FTL_REBUILD_OFFSET		142
+#define MTIP_FTL_REBUILD_MAGIC		0xED51
+#define MTIP_FTL_REBUILD_TIMEOUT_MS	2400000
+
+/* Macro to extract the tag bit number from a tag value. */
+#define MTIP_TAG_BIT(tag)	(tag & 0x1F)
+
+/*
+ * Macro to extract the tag index from a tag value. The index
+ * is used to access the correct s_active/Command Issue register based
+ * on the tag value.
+ */
+#define MTIP_TAG_INDEX(tag)	(tag >> 5)
+
+/*
+ * Maximum number of scatter gather entries
+ * a single command may have.
+ */
+#define MTIP_MAX_SG		128
+
+/*
+ * Maximum number of slot groups (Command Issue & s_active registers)
+ * NOTE: This is the driver maximum; check dd->slot_groups for actual value.
+ */
+#define MTIP_MAX_SLOT_GROUPS	8
+
+/* Internal command tag. */
+#define MTIP_TAG_INTERNAL	0
+
+/* Micron Vendor ID & P320x SSD Device ID */
+#define PCI_VENDOR_ID_MICRON    0x1344
+#define P320_DEVICE_ID		0x5150
+
+/* Driver name and version strings */
+#define MTIP_DRV_NAME		"mtip32xx"
+#define MTIP_DRV_VERSION	"1.2.6os3"
+
+/* Maximum number of minor device numbers per device. */
+#define MTIP_MAX_MINORS		16
+
+/* Maximum number of supported command slots. */
+#define MTIP_MAX_COMMAND_SLOTS	(MTIP_MAX_SLOT_GROUPS * 32)
+
+/*
+ * Per-tag bitfield size in longs.
+ * Linux bit manipulation functions
+ * (i.e. test_and_set_bit, find_next_zero_bit)
+ * manipulate memory in longs, so we try to make the math work.
+ * take the slot groups and find the number of longs, rounding up.
+ * Careful! i386 and x86_64 use different size longs!
+ */
+#define U32_PER_LONG	(sizeof(long) / sizeof(u32))
+#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \
+					(U32_PER_LONG-1))/U32_PER_LONG)
+
+/* BAR number used to access the HBA registers. */
+#define MTIP_ABAR		5
+
+/* Forced Unit Access Bit */
+#define FUA_BIT			0x80
+
+#ifdef DEBUG
+ #define dbg_printk(format, arg...)	\
+	printk(pr_fmt(format), ##arg);
+#else
+ #define dbg_printk(format, arg...)
+#endif
+
+#define __force_bit2int (unsigned int __force)
+
+/* below are bit numbers in 'flags' defined in mtip_port */
+#define MTIP_FLAG_IC_ACTIVE_BIT			0
+#define MTIP_FLAG_EH_ACTIVE_BIT			1
+#define MTIP_FLAG_SVC_THD_ACTIVE_BIT		2
+#define MTIP_FLAG_ISSUE_CMDS_BIT		4
+#define MTIP_FLAG_REBUILD_BIT			5
+#define MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT	8
+
+/* Register Frame Information Structure (FIS), host to device. */
+struct host_to_dev_fis {
+	/*
+	 * FIS type.
+	 * - 27h Register FIS, host to device.
+	 * - 34h Register FIS, device to host.
+	 * - 39h DMA Activate FIS, device to host.
+	 * - 41h DMA Setup FIS, bi-directional.
+	 * - 46h Data FIS, bi-directional.
+	 * - 58h BIST Activate FIS, bi-directional.
+	 * - 5Fh PIO Setup FIS, device to host.
+	 * - A1h Set Device Bits FIS, device to host.
+	 */
+	unsigned char type;
+	unsigned char opts;
+	unsigned char command;
+	unsigned char features;
+
+	union {
+		unsigned char lba_low;
+		unsigned char sector;
+	};
+	union {
+		unsigned char lba_mid;
+		unsigned char cyl_low;
+	};
+	union {
+		unsigned char lba_hi;
+		unsigned char cyl_hi;
+	};
+	union {
+		unsigned char device;
+		unsigned char head;
+	};
+
+	union {
+		unsigned char lba_low_ex;
+		unsigned char sector_ex;
+	};
+	union {
+		unsigned char lba_mid_ex;
+		unsigned char cyl_low_ex;
+	};
+	union {
+		unsigned char lba_hi_ex;
+		unsigned char cyl_hi_ex;
+	};
+	unsigned char features_ex;
+
+	unsigned char sect_count;
+	unsigned char sect_cnt_ex;
+	unsigned char res2;
+	unsigned char control;
+
+	unsigned int res3;
+};
+
+/* Command header structure. */
+struct mtip_cmd_hdr {
+	/*
+	 * Command options.
+	 * - Bits 31:16 Number of PRD entries.
+	 * - Bits 15:8 Unused in this implementation.
+	 * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries.
+	 * - Bit 6 Write bit, should be set when writing data to the device.
+	 * - Bit 5 Unused in this implementation.
+	 * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
+	 */
+	unsigned int opts;
+	/* This field is unsed when using NCQ. */
+	union {
+		unsigned int byte_count;
+		unsigned int status;
+	};
+	/*
+	 * Lower 32 bits of the command table address associated with this
+	 * header. The command table addresses must be 128 byte aligned.
+	 */
+	unsigned int ctba;
+	/*
+	 * If 64 bit addressing is used this field is the upper 32 bits
+	 * of the command table address associated with this command.
+	 */
+	unsigned int ctbau;
+	/* Reserved and unused. */
+	unsigned int res[4];
+};
+
+/* Command scatter gather structure (PRD). */
+struct mtip_cmd_sg {
+	/*
+	 * Low 32 bits of the data buffer address. For P320 this
+	 * address must be 8 byte aligned signified by bits 2:0 being
+	 * set to 0.
+	 */
+	unsigned int dba;
+	/*
+	 * When 64 bit addressing is used this field is the upper
+	 * 32 bits of the data buffer address.
+	 */
+	unsigned int dba_upper;
+	/* Unused. */
+	unsigned int reserved;
+	/*
+	 * Bit 31: interrupt when this data block has been transferred.
+	 * Bits 30..22: reserved
+	 * Bits 21..0: byte count (minus 1).  For P320 the byte count must be
+	 * 8 byte aligned signified by bits 2:0 being set to 1.
+	 */
+	unsigned int info;
+};
+struct mtip_port;
+
+/* Structure used to describe a command. */
+struct mtip_cmd {
+
+	struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
+
+	dma_addr_t command_header_dma; /* corresponding physical address */
+
+	void *command; /* ptr to command table entry */
+
+	dma_addr_t command_dma; /* corresponding physical address */
+
+	void *comp_data; /* data passed to completion function comp_func() */
+	/*
+	 * Completion function called by the ISR upon completion of
+	 * a command.
+	 */
+	void (*comp_func)(struct mtip_port *port,
+				int tag,
+				void *data,
+				int status);
+	/* Additional callback function that may be called by comp_func() */
+	void (*async_callback)(void *data, int status);
+
+	void *async_data; /* Addl. data passed to async_callback() */
+
+	int scatter_ents; /* Number of scatter list entries used */
+
+	struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+
+	int retries; /* The number of retries left for this command. */
+
+	int direction; /* Data transfer direction */
+
+	unsigned long comp_time; /* command completion time, in jiffies */
+
+	atomic_t active; /* declares if this command sent to the drive. */
+};
+
+/* Structure used to describe a port. */
+struct mtip_port {
+	/* Pointer back to the driver data for this port. */
+	struct driver_data *dd;
+	/*
+	 * Used to determine if the data pointed to by the
+	 * identify field is valid.
+	 */
+	unsigned long identify_valid;
+	/* Base address of the memory mapped IO for the port. */
+	void __iomem *mmio;
+	/* Array of pointers to the memory mapped s_active registers. */
+	void __iomem *s_active[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped completed registers. */
+	void __iomem *completed[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped Command Issue registers. */
+	void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS];
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the driver.
+	 */
+	void *command_list;
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t command_list_dma;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the driver.
+	 */
+	void *rxfis;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t rxfis_dma;
+	/*
+	 * Pointer to the beginning of the command table memory as used
+	 * by the driver.
+	 */
+	void *command_table;
+	/*
+	 * Pointer to the beginning of the command table memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t command_tbl_dma;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the driver.
+	 */
+	u16 *identify;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t identify_dma;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the driver when issuing internal commands.
+	 */
+	u16 *sector_buffer;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the DMA when the driver issues internal commands.
+	 */
+	dma_addr_t sector_buffer_dma;
+	/*
+	 * Bit significant, used to determine if a command slot has
+	 * been allocated. i.e. the slot is in use.  Bits are cleared
+	 * when the command slot and all associated data structures
+	 * are no longer needed.
+	 */
+	unsigned long allocated[SLOTBITS_IN_LONGS];
+	/*
+	 * used to queue commands when an internal command is in progress
+	 * or error handling is active
+	 */
+	unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
+	/*
+	 * Array of command slots. Structure includes pointers to the
+	 * command header and command table, and completion function and data
+	 * pointers.
+	 */
+	struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS];
+	/* Used by mtip_service_thread to wait for an event */
+	wait_queue_head_t svc_wait;
+	/*
+	 * indicates the state of the port. Also, helps the service thread
+	 * to determine its action on wake up.
+	 */
+	unsigned long flags;
+	/*
+	 * Timer used to complete commands that have been active for too long.
+	 */
+	struct timer_list cmd_timer;
+	/*
+	 * Semaphore used to block threads if there are no
+	 * command slots available.
+	 */
+	struct semaphore cmd_slot;
+	/* Spinlock for working around command-issue bug. */
+	spinlock_t cmd_issue_lock;
+};
+
+/*
+ * Driver private data structure.
+ *
+ * One structure is allocated per probed device.
+ */
+struct driver_data {
+	void __iomem *mmio; /* Base address of the HBA registers. */
+
+	int major; /* Major device number. */
+
+	int instance; /* Instance number. First device probed is 0, ... */
+
+	struct gendisk *disk; /* Pointer to our gendisk structure. */
+
+	struct pci_dev *pdev; /* Pointer to the PCI device structure. */
+
+	struct request_queue *queue; /* Our request queue. */
+
+	struct mtip_port *port; /* Pointer to the port data structure. */
+
+	/* Tasklet used to process the bottom half of the ISR. */
+	struct tasklet_struct tasklet;
+
+	unsigned product_type; /* magic value declaring the product type */
+
+	unsigned slot_groups; /* number of slot groups the product supports */
+
+	atomic_t drv_cleanup_done; /* Atomic variable for SRSI */
+
+	unsigned long index; /* Index to determine the disk name */
+
+	unsigned int ftlrebuildflag; /* FTL rebuild flag */
+
+	atomic_t resumeflag; /* Atomic variable to track suspend/resume */
+
+	atomic_t eh_active; /* Flag for error handling tracking */
+
+	struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
+};
+
+#endif
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
new file mode 100644
index 000000000000..c1dc4d86c221
--- /dev/null
+++ b/drivers/block/nvme.c
@@ -0,0 +1,1739 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/nvme.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/version.h>
+
+#define NVME_Q_DEPTH 1024
+#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
+#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
+#define NVME_MINORS 64
+#define NVME_IO_TIMEOUT	(5 * HZ)
+#define ADMIN_TIMEOUT	(60 * HZ)
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int use_threaded_interrupts;
+module_param(use_threaded_interrupts, int, 0);
+
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+static struct task_struct *nvme_thread;
+
+/*
+ * Represents an NVM Express device.  Each nvme_dev is a PCI function.
+ */
+struct nvme_dev {
+	struct list_head node;
+	struct nvme_queue **queues;
+	u32 __iomem *dbs;
+	struct pci_dev *pci_dev;
+	struct dma_pool *prp_page_pool;
+	struct dma_pool *prp_small_pool;
+	int instance;
+	int queue_count;
+	int db_stride;
+	u32 ctrl_config;
+	struct msix_entry *entry;
+	struct nvme_bar __iomem *bar;
+	struct list_head namespaces;
+	char serial[20];
+	char model[40];
+	char firmware_rev[8];
+};
+
+/*
+ * An NVM Express namespace is equivalent to a SCSI LUN
+ */
+struct nvme_ns {
+	struct list_head list;
+
+	struct nvme_dev *dev;
+	struct request_queue *queue;
+	struct gendisk *disk;
+
+	int ns_id;
+	int lba_shift;
+};
+
+/*
+ * An NVM Express queue.  Each device has at least two (one for admin
+ * commands and one for I/O commands).
+ */
+struct nvme_queue {
+	struct device *q_dmadev;
+	struct nvme_dev *dev;
+	spinlock_t q_lock;
+	struct nvme_command *sq_cmds;
+	volatile struct nvme_completion *cqes;
+	dma_addr_t sq_dma_addr;
+	dma_addr_t cq_dma_addr;
+	wait_queue_head_t sq_full;
+	wait_queue_t sq_cong_wait;
+	struct bio_list sq_cong;
+	u32 __iomem *q_db;
+	u16 q_depth;
+	u16 cq_vector;
+	u16 sq_head;
+	u16 sq_tail;
+	u16 cq_head;
+	u16 cq_phase;
+	unsigned long cmdid_data[];
+};
+
+/*
+ * Check we didin't inadvertently grow the command struct
+ */
+static inline void _nvme_check_size(void)
+{
+	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
+	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+}
+
+typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
+						struct nvme_completion *);
+
+struct nvme_cmd_info {
+	nvme_completion_fn fn;
+	void *ctx;
+	unsigned long timeout;
+};
+
+static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+{
+	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+}
+
+/**
+ * alloc_cmdid() - Allocate a Command ID
+ * @nvmeq: The queue that will be used for this command
+ * @ctx: A pointer that will be passed to the handler
+ * @handler: The function to call on completion
+ *
+ * Allocate a Command ID for a queue.  The data passed in will
+ * be passed to the completion handler.  This is implemented by using
+ * the bottom two bits of the ctx pointer to store the handler ID.
+ * Passing in a pointer that's not 4-byte aligned will cause a BUG.
+ * We can change this if it becomes a problem.
+ *
+ * May be called with local interrupts disabled and the q_lock held,
+ * or with interrupts enabled and no locks held.
+ */
+static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
+				nvme_completion_fn handler, unsigned timeout)
+{
+	int depth = nvmeq->q_depth - 1;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	int cmdid;
+
+	do {
+		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
+		if (cmdid >= depth)
+			return -EBUSY;
+	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
+
+	info[cmdid].fn = handler;
+	info[cmdid].ctx = ctx;
+	info[cmdid].timeout = jiffies + timeout;
+	return cmdid;
+}
+
+static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
+				nvme_completion_fn handler, unsigned timeout)
+{
+	int cmdid;
+	wait_event_killable(nvmeq->sq_full,
+		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
+	return (cmdid < 0) ? -EINTR : cmdid;
+}
+
+/* Special values must be less than 0x1000 */
+#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
+#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
+#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
+#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
+#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)
+
+static void special_completion(struct nvme_dev *dev, void *ctx,
+						struct nvme_completion *cqe)
+{
+	if (ctx == CMD_CTX_CANCELLED)
+		return;
+	if (ctx == CMD_CTX_FLUSH)
+		return;
+	if (ctx == CMD_CTX_COMPLETED) {
+		dev_warn(&dev->pci_dev->dev,
+				"completed id %d twice on queue %d\n",
+				cqe->command_id, le16_to_cpup(&cqe->sq_id));
+		return;
+	}
+	if (ctx == CMD_CTX_INVALID) {
+		dev_warn(&dev->pci_dev->dev,
+				"invalid id %d completed on queue %d\n",
+				cqe->command_id, le16_to_cpup(&cqe->sq_id));
+		return;
+	}
+
+	dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ */
+static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
+						nvme_completion_fn *fn)
+{
+	void *ctx;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+
+	if (cmdid >= nvmeq->q_depth) {
+		*fn = special_completion;
+		return CMD_CTX_INVALID;
+	}
+	*fn = info[cmdid].fn;
+	ctx = info[cmdid].ctx;
+	info[cmdid].fn = special_completion;
+	info[cmdid].ctx = CMD_CTX_COMPLETED;
+	clear_bit(cmdid, nvmeq->cmdid_data);
+	wake_up(&nvmeq->sq_full);
+	return ctx;
+}
+
+static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
+						nvme_completion_fn *fn)
+{
+	void *ctx;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	if (fn)
+		*fn = info[cmdid].fn;
+	ctx = info[cmdid].ctx;
+	info[cmdid].fn = special_completion;
+	info[cmdid].ctx = CMD_CTX_CANCELLED;
+	return ctx;
+}
+
+static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
+{
+	return dev->queues[get_cpu() + 1];
+}
+
+static void put_nvmeq(struct nvme_queue *nvmeq)
+{
+	put_cpu();
+}
+
+/**
+ * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * @nvmeq: The queue to use
+ * @cmd: The command to send
+ *
+ * Safe to use from interrupt context
+ */
+static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+{
+	unsigned long flags;
+	u16 tail;
+	spin_lock_irqsave(&nvmeq->q_lock, flags);
+	tail = nvmeq->sq_tail;
+	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+	if (++tail == nvmeq->q_depth)
+		tail = 0;
+	writel(tail, nvmeq->q_db);
+	nvmeq->sq_tail = tail;
+	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+
+	return 0;
+}
+
+/*
+ * The nvme_iod describes the data in an I/O, including the list of PRP
+ * entries.  You can't see it in this data structure because C doesn't let
+ * me express that.  Use nvme_alloc_iod to ensure there's enough space
+ * allocated to store the PRP list.
+ */
+struct nvme_iod {
+	void *private;		/* For the use of the submitter of the I/O */
+	int npages;		/* In the PRP list. 0 means small pool in use */
+	int offset;		/* Of PRP list */
+	int nents;		/* Used in scatterlist */
+	int length;		/* Of data, in bytes */
+	dma_addr_t first_dma;
+	struct scatterlist sg[0];
+};
+
+static __le64 **iod_list(struct nvme_iod *iod)
+{
+	return ((void *)iod) + iod->offset;
+}
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size)
+{
+	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static struct nvme_iod *
+nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
+{
+	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
+				sizeof(__le64 *) * nvme_npages(nbytes) +
+				sizeof(struct scatterlist) * nseg, gfp);
+
+	if (iod) {
+		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+		iod->npages = -1;
+		iod->length = nbytes;
+	}
+
+	return iod;
+}
+
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+{
+	const int last_prp = PAGE_SIZE / 8 - 1;
+	int i;
+	__le64 **list = iod_list(iod);
+	dma_addr_t prp_dma = iod->first_dma;
+
+	if (iod->npages == 0)
+		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+	for (i = 0; i < iod->npages; i++) {
+		__le64 *prp_list = list[i];
+		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
+		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
+		prp_dma = next_prp_dma;
+	}
+	kfree(iod);
+}
+
+static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
+{
+	struct nvme_queue *nvmeq = get_nvmeq(dev);
+	if (bio_list_empty(&nvmeq->sq_cong))
+		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+	bio_list_add(&nvmeq->sq_cong, bio);
+	put_nvmeq(nvmeq);
+	wake_up_process(nvme_thread);
+}
+
+static void bio_completion(struct nvme_dev *dev, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct nvme_iod *iod = ctx;
+	struct bio *bio = iod->private;
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	nvme_free_iod(dev, iod);
+	if (status) {
+		bio_endio(bio, -EIO);
+	} else if (bio->bi_vcnt > bio->bi_idx) {
+		requeue_bio(dev, bio);
+	} else {
+		bio_endio(bio, 0);
+	}
+}
+
+/* length is in bytes.  gfp flags indicates whether we may sleep. */
+static int nvme_setup_prps(struct nvme_dev *dev,
+			struct nvme_common_command *cmd, struct nvme_iod *iod,
+			int total_len, gfp_t gfp)
+{
+	struct dma_pool *pool;
+	int length = total_len;
+	struct scatterlist *sg = iod->sg;
+	int dma_len = sg_dma_len(sg);
+	u64 dma_addr = sg_dma_address(sg);
+	int offset = offset_in_page(dma_addr);
+	__le64 *prp_list;
+	__le64 **list = iod_list(iod);
+	dma_addr_t prp_dma;
+	int nprps, i;
+
+	cmd->prp1 = cpu_to_le64(dma_addr);
+	length -= (PAGE_SIZE - offset);
+	if (length <= 0)
+		return total_len;
+
+	dma_len -= (PAGE_SIZE - offset);
+	if (dma_len) {
+		dma_addr += (PAGE_SIZE - offset);
+	} else {
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+
+	if (length <= PAGE_SIZE) {
+		cmd->prp2 = cpu_to_le64(dma_addr);
+		return total_len;
+	}
+
+	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+	if (!prp_list) {
+		cmd->prp2 = cpu_to_le64(dma_addr);
+		iod->npages = -1;
+		return (total_len - length) + PAGE_SIZE;
+	}
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+	cmd->prp2 = cpu_to_le64(prp_dma);
+	i = 0;
+	for (;;) {
+		if (i == PAGE_SIZE / 8) {
+			__le64 *old_prp_list = prp_list;
+			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+			if (!prp_list)
+				return total_len - length;
+			list[iod->npages++] = prp_list;
+			prp_list[0] = old_prp_list[i - 1];
+			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+			i = 1;
+		}
+		prp_list[i++] = cpu_to_le64(dma_addr);
+		dma_len -= PAGE_SIZE;
+		dma_addr += PAGE_SIZE;
+		length -= PAGE_SIZE;
+		if (length <= 0)
+			break;
+		if (dma_len > 0)
+			continue;
+		BUG_ON(dma_len < 0);
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+
+	return total_len;
+}
+
+/* NVMe scatterlists require no holes in the virtual address */
+#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
+			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
+
+static int nvme_map_bio(struct device *dev, struct nvme_iod *iod,
+		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
+{
+	struct bio_vec *bvec, *bvprv = NULL;
+	struct scatterlist *sg = NULL;
+	int i, old_idx, length = 0, nsegs = 0;
+
+	sg_init_table(iod->sg, psegs);
+	old_idx = bio->bi_idx;
+	bio_for_each_segment(bvec, bio, i) {
+		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
+			sg->length += bvec->bv_len;
+		} else {
+			if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
+				break;
+			sg = sg ? sg + 1 : iod->sg;
+			sg_set_page(sg, bvec->bv_page, bvec->bv_len,
+							bvec->bv_offset);
+			nsegs++;
+		}
+		length += bvec->bv_len;
+		bvprv = bvec;
+	}
+	bio->bi_idx = i;
+	iod->nents = nsegs;
+	sg_mark_end(sg);
+	if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) {
+		bio->bi_idx = old_idx;
+		return -ENOMEM;
+	}
+	return length;
+}
+
+static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+								int cmdid)
+{
+	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+	memset(cmnd, 0, sizeof(*cmnd));
+	cmnd->common.opcode = nvme_cmd_flush;
+	cmnd->common.command_id = cmdid;
+	cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+
+	return 0;
+}
+
+static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
+{
+	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
+					special_completion, NVME_IO_TIMEOUT);
+	if (unlikely(cmdid < 0))
+		return cmdid;
+
+	return nvme_submit_flush(nvmeq, ns, cmdid);
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ */
+static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+								struct bio *bio)
+{
+	struct nvme_command *cmnd;
+	struct nvme_iod *iod;
+	enum dma_data_direction dma_dir;
+	int cmdid, length, result = -ENOMEM;
+	u16 control;
+	u32 dsmgmt;
+	int psegs = bio_phys_segments(ns->queue, bio);
+
+	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
+		result = nvme_submit_flush_data(nvmeq, ns);
+		if (result)
+			return result;
+	}
+
+	iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
+	if (!iod)
+		goto nomem;
+	iod->private = bio;
+
+	result = -EBUSY;
+	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
+	if (unlikely(cmdid < 0))
+		goto free_iod;
+
+	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
+		return nvme_submit_flush(nvmeq, ns, cmdid);
+
+	control = 0;
+	if (bio->bi_rw & REQ_FUA)
+		control |= NVME_RW_FUA;
+	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+		control |= NVME_RW_LR;
+
+	dsmgmt = 0;
+	if (bio->bi_rw & REQ_RAHEAD)
+		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+
+	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+	memset(cmnd, 0, sizeof(*cmnd));
+	if (bio_data_dir(bio)) {
+		cmnd->rw.opcode = nvme_cmd_write;
+		dma_dir = DMA_TO_DEVICE;
+	} else {
+		cmnd->rw.opcode = nvme_cmd_read;
+		dma_dir = DMA_FROM_DEVICE;
+	}
+
+	result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
+	if (result < 0)
+		goto free_iod;
+	length = result;
+
+	cmnd->rw.command_id = cmdid;
+	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
+								GFP_ATOMIC);
+	cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
+	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
+	cmnd->rw.control = cpu_to_le16(control);
+	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+	bio->bi_sector += length >> 9;
+
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+
+	return 0;
+
+ free_iod:
+	nvme_free_iod(nvmeq->dev, iod);
+ nomem:
+	return result;
+}
+
+static void nvme_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
+	int result = -EBUSY;
+
+	spin_lock_irq(&nvmeq->q_lock);
+	if (bio_list_empty(&nvmeq->sq_cong))
+		result = nvme_submit_bio_queue(nvmeq, ns, bio);
+	if (unlikely(result)) {
+		if (bio_list_empty(&nvmeq->sq_cong))
+			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+		bio_list_add(&nvmeq->sq_cong, bio);
+	}
+
+	spin_unlock_irq(&nvmeq->q_lock);
+	put_nvmeq(nvmeq);
+}
+
+static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
+{
+	u16 head, phase;
+
+	head = nvmeq->cq_head;
+	phase = nvmeq->cq_phase;
+
+	for (;;) {
+		void *ctx;
+		nvme_completion_fn fn;
+		struct nvme_completion cqe = nvmeq->cqes[head];
+		if ((le16_to_cpu(cqe.status) & 1) != phase)
+			break;
+		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
+		if (++head == nvmeq->q_depth) {
+			head = 0;
+			phase = !phase;
+		}
+
+		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
+		fn(nvmeq->dev, ctx, &cqe);
+	}
+
+	/* If the controller ignores the cq head doorbell and continuously
+	 * writes to the queue, it is theoretically possible to wrap around
+	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
+	 * requires that 0.1% of your interrupts are handled, so this isn't
+	 * a big problem.
+	 */
+	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
+		return IRQ_NONE;
+
+	writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+	nvmeq->cq_head = head;
+	nvmeq->cq_phase = phase;
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t nvme_irq(int irq, void *data)
+{
+	irqreturn_t result;
+	struct nvme_queue *nvmeq = data;
+	spin_lock(&nvmeq->q_lock);
+	result = nvme_process_cq(nvmeq);
+	spin_unlock(&nvmeq->q_lock);
+	return result;
+}
+
+static irqreturn_t nvme_irq_check(int irq, void *data)
+{
+	struct nvme_queue *nvmeq = data;
+	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
+	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
+		return IRQ_NONE;
+	return IRQ_WAKE_THREAD;
+}
+
+static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
+{
+	spin_lock_irq(&nvmeq->q_lock);
+	cancel_cmdid(nvmeq, cmdid, NULL);
+	spin_unlock_irq(&nvmeq->q_lock);
+}
+
+struct sync_cmd_info {
+	struct task_struct *task;
+	u32 result;
+	int status;
+};
+
+static void sync_completion(struct nvme_dev *dev, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct sync_cmd_info *cmdinfo = ctx;
+	cmdinfo->result = le32_to_cpup(&cqe->result);
+	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+	wake_up_process(cmdinfo->task);
+}
+
+/*
+ * Returns 0 on success.  If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
+			struct nvme_command *cmd, u32 *result, unsigned timeout)
+{
+	int cmdid;
+	struct sync_cmd_info cmdinfo;
+
+	cmdinfo.task = current;
+	cmdinfo.status = -EINTR;
+
+	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
+								timeout);
+	if (cmdid < 0)
+		return cmdid;
+	cmd->common.command_id = cmdid;
+
+	set_current_state(TASK_KILLABLE);
+	nvme_submit_cmd(nvmeq, cmd);
+	schedule();
+
+	if (cmdinfo.status == -EINTR) {
+		nvme_abort_command(nvmeq, cmdid);
+		return -EINTR;
+	}
+
+	if (result)
+		*result = cmdinfo.result;
+
+	return cmdinfo.status;
+}
+
+static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+								u32 *result)
+{
+	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
+}
+
+static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
+{
+	int status;
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.delete_queue.opcode = opcode;
+	c.delete_queue.qid = cpu_to_le16(id);
+
+	status = nvme_submit_admin_cmd(dev, &c, NULL);
+	if (status)
+		return -EIO;
+	return 0;
+}
+
+static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
+						struct nvme_queue *nvmeq)
+{
+	int status;
+	struct nvme_command c;
+	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+
+	memset(&c, 0, sizeof(c));
+	c.create_cq.opcode = nvme_admin_create_cq;
+	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
+	c.create_cq.cqid = cpu_to_le16(qid);
+	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	c.create_cq.cq_flags = cpu_to_le16(flags);
+	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+
+	status = nvme_submit_admin_cmd(dev, &c, NULL);
+	if (status)
+		return -EIO;
+	return 0;
+}
+
+static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+						struct nvme_queue *nvmeq)
+{
+	int status;
+	struct nvme_command c;
+	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+
+	memset(&c, 0, sizeof(c));
+	c.create_sq.opcode = nvme_admin_create_sq;
+	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
+	c.create_sq.sqid = cpu_to_le16(qid);
+	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	c.create_sq.sq_flags = cpu_to_le16(flags);
+	c.create_sq.cqid = cpu_to_le16(qid);
+
+	status = nvme_submit_admin_cmd(dev, &c, NULL);
+	if (status)
+		return -EIO;
+	return 0;
+}
+
+static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
+{
+	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
+}
+
+static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
+{
+	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
+}
+
+static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
+							dma_addr_t dma_addr)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.prp1 = cpu_to_le64(dma_addr);
+	c.identify.cns = cpu_to_le32(cns);
+
+	return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
+				unsigned dword11, dma_addr_t dma_addr)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.features.opcode = nvme_admin_get_features;
+	c.features.prp1 = cpu_to_le64(dma_addr);
+	c.features.fid = cpu_to_le32(fid);
+	c.features.dword11 = cpu_to_le32(dword11);
+
+	return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
+			unsigned dword11, dma_addr_t dma_addr, u32 *result)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.features.opcode = nvme_admin_set_features;
+	c.features.prp1 = cpu_to_le64(dma_addr);
+	c.features.fid = cpu_to_le32(fid);
+	c.features.dword11 = cpu_to_le32(dword11);
+
+	return nvme_submit_admin_cmd(dev, &c, result);
+}
+
+static void nvme_free_queue(struct nvme_dev *dev, int qid)
+{
+	struct nvme_queue *nvmeq = dev->queues[qid];
+	int vector = dev->entry[nvmeq->cq_vector].vector;
+
+	irq_set_affinity_hint(vector, NULL);
+	free_irq(vector, nvmeq);
+
+	/* Don't tell the adapter to delete the admin queue */
+	if (qid) {
+		adapter_delete_sq(dev, qid);
+		adapter_delete_cq(dev, qid);
+	}
+
+	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	kfree(nvmeq);
+}
+
+static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
+							int depth, int vector)
+{
+	struct device *dmadev = &dev->pci_dev->dev;
+	unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
+	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
+	if (!nvmeq)
+		return NULL;
+
+	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
+					&nvmeq->cq_dma_addr, GFP_KERNEL);
+	if (!nvmeq->cqes)
+		goto free_nvmeq;
+	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
+
+	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
+					&nvmeq->sq_dma_addr, GFP_KERNEL);
+	if (!nvmeq->sq_cmds)
+		goto free_cqdma;
+
+	nvmeq->q_dmadev = dmadev;
+	nvmeq->dev = dev;
+	spin_lock_init(&nvmeq->q_lock);
+	nvmeq->cq_head = 0;
+	nvmeq->cq_phase = 1;
+	init_waitqueue_head(&nvmeq->sq_full);
+	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
+	bio_list_init(&nvmeq->sq_cong);
+	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+	nvmeq->q_depth = depth;
+	nvmeq->cq_vector = vector;
+
+	return nvmeq;
+
+ free_cqdma:
+	dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
+							nvmeq->cq_dma_addr);
+ free_nvmeq:
+	kfree(nvmeq);
+	return NULL;
+}
+
+static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+							const char *name)
+{
+	if (use_threaded_interrupts)
+		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
+					nvme_irq_check, nvme_irq,
+					IRQF_DISABLED | IRQF_SHARED,
+					name, nvmeq);
+	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
+				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
+}
+
+static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
+					int qid, int cq_size, int vector)
+{
+	int result;
+	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
+
+	if (!nvmeq)
+		return ERR_PTR(-ENOMEM);
+
+	result = adapter_alloc_cq(dev, qid, nvmeq);
+	if (result < 0)
+		goto free_nvmeq;
+
+	result = adapter_alloc_sq(dev, qid, nvmeq);
+	if (result < 0)
+		goto release_cq;
+
+	result = queue_request_irq(dev, nvmeq, "nvme");
+	if (result < 0)
+		goto release_sq;
+
+	return nvmeq;
+
+ release_sq:
+	adapter_delete_sq(dev, qid);
+ release_cq:
+	adapter_delete_cq(dev, qid);
+ free_nvmeq:
+	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	kfree(nvmeq);
+	return ERR_PTR(result);
+}
+
+static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
+{
+	int result;
+	u32 aqa;
+	u64 cap;
+	unsigned long timeout;
+	struct nvme_queue *nvmeq;
+
+	dev->dbs = ((void __iomem *)dev->bar) + 4096;
+
+	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+	if (!nvmeq)
+		return -ENOMEM;
+
+	aqa = nvmeq->q_depth - 1;
+	aqa |= aqa << 16;
+
+	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
+	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
+	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+
+	writel(0, &dev->bar->cc);
+	writel(aqa, &dev->bar->aqa);
+	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
+	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
+	writel(dev->ctrl_config, &dev->bar->cc);
+
+	cap = readq(&dev->bar->cap);
+	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+	dev->db_stride = NVME_CAP_STRIDE(cap);
+
+	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
+		msleep(100);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (time_after(jiffies, timeout)) {
+			dev_err(&dev->pci_dev->dev,
+				"Device not ready; aborting initialisation\n");
+			return -ENODEV;
+		}
+	}
+
+	result = queue_request_irq(dev, nvmeq, "nvme admin");
+	dev->queues[0] = nvmeq;
+	return result;
+}
+
+static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
+				unsigned long addr, unsigned length)
+{
+	int i, err, count, nents, offset;
+	struct scatterlist *sg;
+	struct page **pages;
+	struct nvme_iod *iod;
+
+	if (addr & 3)
+		return ERR_PTR(-EINVAL);
+	if (!length)
+		return ERR_PTR(-EINVAL);
+
+	offset = offset_in_page(addr);
+	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
+
+	err = get_user_pages_fast(addr, count, 1, pages);
+	if (err < count) {
+		count = err;
+		err = -EFAULT;
+		goto put_pages;
+	}
+
+	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
+	sg = iod->sg;
+	sg_init_table(sg, count);
+	for (i = 0; i < count; i++) {
+		sg_set_page(&sg[i], pages[i],
+				min_t(int, length, PAGE_SIZE - offset), offset);
+		length -= (PAGE_SIZE - offset);
+		offset = 0;
+	}
+	sg_mark_end(&sg[i - 1]);
+	iod->nents = count;
+
+	err = -ENOMEM;
+	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
+				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	if (!nents)
+		goto free_iod;
+
+	kfree(pages);
+	return iod;
+
+ free_iod:
+	kfree(iod);
+ put_pages:
+	for (i = 0; i < count; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	return ERR_PTR(err);
+}
+
+static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
+			struct nvme_iod *iod)
+{
+	int i;
+
+	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+	for (i = 0; i < iod->nents; i++)
+		put_page(sg_page(&iod->sg[i]));
+}
+
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_queue *nvmeq;
+	struct nvme_user_io io;
+	struct nvme_command c;
+	unsigned length;
+	int status;
+	struct nvme_iod *iod;
+
+	if (copy_from_user(&io, uio, sizeof(io)))
+		return -EFAULT;
+	length = (io.nblocks + 1) << ns->lba_shift;
+
+	switch (io.opcode) {
+	case nvme_cmd_write:
+	case nvme_cmd_read:
+	case nvme_cmd_compare:
+		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (IS_ERR(iod))
+		return PTR_ERR(iod);
+
+	memset(&c, 0, sizeof(c));
+	c.rw.opcode = io.opcode;
+	c.rw.flags = io.flags;
+	c.rw.nsid = cpu_to_le32(ns->ns_id);
+	c.rw.slba = cpu_to_le64(io.slba);
+	c.rw.length = cpu_to_le16(io.nblocks);
+	c.rw.control = cpu_to_le16(io.control);
+	c.rw.dsmgmt = cpu_to_le16(io.dsmgmt);
+	c.rw.reftag = io.reftag;
+	c.rw.apptag = io.apptag;
+	c.rw.appmask = io.appmask;
+	/* XXX: metadata */
+	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
+
+	nvmeq = get_nvmeq(dev);
+	/*
+	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
+	 * disabled.  We may be preempted at any point, and be rescheduled
+	 * to a different CPU.  That will cause cacheline bouncing, but no
+	 * additional races since q_lock already protects against other CPUs.
+	 */
+	put_nvmeq(nvmeq);
+	if (length != (io.nblocks + 1) << ns->lba_shift)
+		status = -ENOMEM;
+	else
+		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+
+	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
+	nvme_free_iod(dev, iod);
+	return status;
+}
+
+static int nvme_user_admin_cmd(struct nvme_ns *ns,
+					struct nvme_admin_cmd __user *ucmd)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_admin_cmd cmd;
+	struct nvme_command c;
+	int status, length;
+	struct nvme_iod *iod;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+		return -EFAULT;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = cmd.opcode;
+	c.common.flags = cmd.flags;
+	c.common.nsid = cpu_to_le32(cmd.nsid);
+	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
+	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
+	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
+	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
+	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
+	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+
+	length = cmd.data_len;
+	if (cmd.data_len) {
+		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
+								length);
+		if (IS_ERR(iod))
+			return PTR_ERR(iod);
+		length = nvme_setup_prps(dev, &c.common, iod, length,
+								GFP_KERNEL);
+	}
+
+	if (length != cmd.data_len)
+		status = -ENOMEM;
+	else
+		status = nvme_submit_admin_cmd(dev, &c, NULL);
+
+	if (cmd.data_len) {
+		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
+		nvme_free_iod(dev, iod);
+	}
+	return status;
+}
+
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+							unsigned long arg)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case NVME_IOCTL_ID:
+		return ns->ns_id;
+	case NVME_IOCTL_ADMIN_CMD:
+		return nvme_user_admin_cmd(ns, (void __user *)arg);
+	case NVME_IOCTL_SUBMIT_IO:
+		return nvme_submit_io(ns, (void __user *)arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
+static const struct block_device_operations nvme_fops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= nvme_ioctl,
+	.compat_ioctl	= nvme_ioctl,
+};
+
+static void nvme_timeout_ios(struct nvme_queue *nvmeq)
+{
+	int depth = nvmeq->q_depth - 1;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	unsigned long now = jiffies;
+	int cmdid;
+
+	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+		void *ctx;
+		nvme_completion_fn fn;
+		static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, };
+
+		if (!time_after(now, info[cmdid].timeout))
+			continue;
+		dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
+		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+		fn(nvmeq->dev, ctx, &cqe);
+	}
+}
+
+static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
+{
+	while (bio_list_peek(&nvmeq->sq_cong)) {
+		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
+		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
+			bio_list_add_head(&nvmeq->sq_cong, bio);
+			break;
+		}
+		if (bio_list_empty(&nvmeq->sq_cong))
+			remove_wait_queue(&nvmeq->sq_full,
+							&nvmeq->sq_cong_wait);
+	}
+}
+
+static int nvme_kthread(void *data)
+{
+	struct nvme_dev *dev;
+
+	while (!kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
+		spin_lock(&dev_list_lock);
+		list_for_each_entry(dev, &dev_list, node) {
+			int i;
+			for (i = 0; i < dev->queue_count; i++) {
+				struct nvme_queue *nvmeq = dev->queues[i];
+				if (!nvmeq)
+					continue;
+				spin_lock_irq(&nvmeq->q_lock);
+				if (nvme_process_cq(nvmeq))
+					printk("process_cq did something\n");
+				nvme_timeout_ios(nvmeq);
+				nvme_resubmit_bios(nvmeq);
+				spin_unlock_irq(&nvmeq->q_lock);
+			}
+		}
+		spin_unlock(&dev_list_lock);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ);
+	}
+	return 0;
+}
+
+static DEFINE_IDA(nvme_index_ida);
+
+static int nvme_get_ns_idx(void)
+{
+	int index, error;
+
+	do {
+		if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
+			return -1;
+
+		spin_lock(&dev_list_lock);
+		error = ida_get_new(&nvme_index_ida, &index);
+		spin_unlock(&dev_list_lock);
+	} while (error == -EAGAIN);
+
+	if (error)
+		index = -1;
+	return index;
+}
+
+static void nvme_put_ns_idx(int index)
+{
+	spin_lock(&dev_list_lock);
+	ida_remove(&nvme_index_ida, index);
+	spin_unlock(&dev_list_lock);
+}
+
+static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
+			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
+{
+	struct nvme_ns *ns;
+	struct gendisk *disk;
+	int lbaf;
+
+	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
+		return NULL;
+
+	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	if (!ns)
+		return NULL;
+	ns->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!ns->queue)
+		goto out_free_ns;
+	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+/*	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */
+	blk_queue_make_request(ns->queue, nvme_make_request);
+	ns->dev = dev;
+	ns->queue->queuedata = ns;
+
+	disk = alloc_disk(NVME_MINORS);
+	if (!disk)
+		goto out_free_queue;
+	ns->ns_id = nsid;
+	ns->disk = disk;
+	lbaf = id->flbas & 0xf;
+	ns->lba_shift = id->lbaf[lbaf].ds;
+
+	disk->major = nvme_major;
+	disk->minors = NVME_MINORS;
+	disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
+	disk->fops = &nvme_fops;
+	disk->private_data = ns;
+	disk->queue = ns->queue;
+	disk->driverfs_dev = &dev->pci_dev->dev;
+	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
+	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+	return ns;
+
+ out_free_queue:
+	blk_cleanup_queue(ns->queue);
+ out_free_ns:
+	kfree(ns);
+	return NULL;
+}
+
+static void nvme_ns_free(struct nvme_ns *ns)
+{
+	int index = ns->disk->first_minor / NVME_MINORS;
+	put_disk(ns->disk);
+	nvme_put_ns_idx(index);
+	blk_cleanup_queue(ns->queue);
+	kfree(ns);
+}
+
+static int set_queue_count(struct nvme_dev *dev, int count)
+{
+	int status;
+	u32 result;
+	u32 q_count = (count - 1) | ((count - 1) << 16);
+
+	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
+								&result);
+	if (status)
+		return -EIO;
+	return min(result & 0xffff, result >> 16) + 1;
+}
+
+static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
+{
+	int result, cpu, i, nr_io_queues, db_bar_size;
+
+	nr_io_queues = num_online_cpus();
+	result = set_queue_count(dev, nr_io_queues);
+	if (result < 0)
+		return result;
+	if (result < nr_io_queues)
+		nr_io_queues = result;
+
+	/* Deregister the admin queue's interrupt */
+	free_irq(dev->entry[0].vector, dev->queues[0]);
+
+	db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
+	if (db_bar_size > 8192) {
+		iounmap(dev->bar);
+		dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0),
+								db_bar_size);
+		dev->dbs = ((void __iomem *)dev->bar) + 4096;
+		dev->queues[0]->q_db = dev->dbs;
+	}
+
+	for (i = 0; i < nr_io_queues; i++)
+		dev->entry[i].entry = i;
+	for (;;) {
+		result = pci_enable_msix(dev->pci_dev, dev->entry,
+								nr_io_queues);
+		if (result == 0) {
+			break;
+		} else if (result > 0) {
+			nr_io_queues = result;
+			continue;
+		} else {
+			nr_io_queues = 1;
+			break;
+		}
+	}
+
+	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
+	/* XXX: handle failure here */
+
+	cpu = cpumask_first(cpu_online_mask);
+	for (i = 0; i < nr_io_queues; i++) {
+		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
+		cpu = cpumask_next(cpu, cpu_online_mask);
+	}
+
+	for (i = 0; i < nr_io_queues; i++) {
+		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
+							NVME_Q_DEPTH, i);
+		if (IS_ERR(dev->queues[i + 1]))
+			return PTR_ERR(dev->queues[i + 1]);
+		dev->queue_count++;
+	}
+
+	for (; i < num_possible_cpus(); i++) {
+		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
+		dev->queues[i + 1] = dev->queues[target + 1];
+	}
+
+	return 0;
+}
+
+static void nvme_free_queues(struct nvme_dev *dev)
+{
+	int i;
+
+	for (i = dev->queue_count - 1; i >= 0; i--)
+		nvme_free_queue(dev, i);
+}
+
+static int __devinit nvme_dev_add(struct nvme_dev *dev)
+{
+	int res, nn, i;
+	struct nvme_ns *ns, *next;
+	struct nvme_id_ctrl *ctrl;
+	struct nvme_id_ns *id_ns;
+	void *mem;
+	dma_addr_t dma_addr;
+
+	res = nvme_setup_io_queues(dev);
+	if (res)
+		return res;
+
+	mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
+								GFP_KERNEL);
+
+	res = nvme_identify(dev, 0, 1, dma_addr);
+	if (res) {
+		res = -EIO;
+		goto out_free;
+	}
+
+	ctrl = mem;
+	nn = le32_to_cpup(&ctrl->nn);
+	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
+	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
+	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
+
+	id_ns = mem;
+	for (i = 1; i <= nn; i++) {
+		res = nvme_identify(dev, i, 0, dma_addr);
+		if (res)
+			continue;
+
+		if (id_ns->ncap == 0)
+			continue;
+
+		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
+							dma_addr + 4096);
+		if (res)
+			continue;
+
+		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
+		if (ns)
+			list_add_tail(&ns->list, &dev->namespaces);
+	}
+	list_for_each_entry(ns, &dev->namespaces, list)
+		add_disk(ns->disk);
+
+	goto out;
+
+ out_free:
+	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+		list_del(&ns->list);
+		nvme_ns_free(ns);
+	}
+
+ out:
+	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
+	return res;
+}
+
+static int nvme_dev_remove(struct nvme_dev *dev)
+{
+	struct nvme_ns *ns, *next;
+
+	spin_lock(&dev_list_lock);
+	list_del(&dev->node);
+	spin_unlock(&dev_list_lock);
+
+	/* TODO: wait all I/O finished or cancel them */
+
+	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+		list_del(&ns->list);
+		del_gendisk(ns->disk);
+		nvme_ns_free(ns);
+	}
+
+	nvme_free_queues(dev);
+
+	return 0;
+}
+
+static int nvme_setup_prp_pools(struct nvme_dev *dev)
+{
+	struct device *dmadev = &dev->pci_dev->dev;
+	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
+						PAGE_SIZE, PAGE_SIZE, 0);
+	if (!dev->prp_page_pool)
+		return -ENOMEM;
+
+	/* Optimisation for I/Os between 4k and 128k */
+	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
+						256, 256, 0);
+	if (!dev->prp_small_pool) {
+		dma_pool_destroy(dev->prp_page_pool);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void nvme_release_prp_pools(struct nvme_dev *dev)
+{
+	dma_pool_destroy(dev->prp_page_pool);
+	dma_pool_destroy(dev->prp_small_pool);
+}
+
+/* XXX: Use an ida or something to let remove / add work correctly */
+static void nvme_set_instance(struct nvme_dev *dev)
+{
+	static int instance;
+	dev->instance = instance++;
+}
+
+static void nvme_release_instance(struct nvme_dev *dev)
+{
+}
+
+static int __devinit nvme_probe(struct pci_dev *pdev,
+						const struct pci_device_id *id)
+{
+	int bars, result = -ENOMEM;
+	struct nvme_dev *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
+								GFP_KERNEL);
+	if (!dev->entry)
+		goto free;
+	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
+								GFP_KERNEL);
+	if (!dev->queues)
+		goto free;
+
+	if (pci_enable_device_mem(pdev))
+		goto free;
+	pci_set_master(pdev);
+	bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	if (pci_request_selected_regions(pdev, bars, "nvme"))
+		goto disable;
+
+	INIT_LIST_HEAD(&dev->namespaces);
+	dev->pci_dev = pdev;
+	pci_set_drvdata(pdev, dev);
+	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
+	nvme_set_instance(dev);
+	dev->entry[0].vector = pdev->irq;
+
+	result = nvme_setup_prp_pools(dev);
+	if (result)
+		goto disable_msix;
+
+	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+	if (!dev->bar) {
+		result = -ENOMEM;
+		goto disable_msix;
+	}
+
+	result = nvme_configure_admin_queue(dev);
+	if (result)
+		goto unmap;
+	dev->queue_count++;
+
+	spin_lock(&dev_list_lock);
+	list_add(&dev->node, &dev_list);
+	spin_unlock(&dev_list_lock);
+
+	result = nvme_dev_add(dev);
+	if (result)
+		goto delete;
+
+	return 0;
+
+ delete:
+	spin_lock(&dev_list_lock);
+	list_del(&dev->node);
+	spin_unlock(&dev_list_lock);
+
+	nvme_free_queues(dev);
+ unmap:
+	iounmap(dev->bar);
+ disable_msix:
+	pci_disable_msix(pdev);
+	nvme_release_instance(dev);
+	nvme_release_prp_pools(dev);
+ disable:
+	pci_disable_device(pdev);
+	pci_release_regions(pdev);
+ free:
+	kfree(dev->queues);
+	kfree(dev->entry);
+	kfree(dev);
+	return result;
+}
+
+static void __devexit nvme_remove(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+	nvme_dev_remove(dev);
+	pci_disable_msix(pdev);
+	iounmap(dev->bar);
+	nvme_release_instance(dev);
+	nvme_release_prp_pools(dev);
+	pci_disable_device(pdev);
+	pci_release_regions(pdev);
+	kfree(dev->queues);
+	kfree(dev->entry);
+	kfree(dev);
+}
+
+/* These functions are yet to be implemented */
+#define nvme_error_detected NULL
+#define nvme_dump_registers NULL
+#define nvme_link_reset NULL
+#define nvme_slot_reset NULL
+#define nvme_error_resume NULL
+#define nvme_suspend NULL
+#define nvme_resume NULL
+
+static struct pci_error_handlers nvme_err_handler = {
+	.error_detected	= nvme_error_detected,
+	.mmio_enabled	= nvme_dump_registers,
+	.link_reset	= nvme_link_reset,
+	.slot_reset	= nvme_slot_reset,
+	.resume		= nvme_error_resume,
+};
+
+/* Move to pci_ids.h later */
+#define PCI_CLASS_STORAGE_EXPRESS	0x010802
+
+static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
+	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, nvme_id_table);
+
+static struct pci_driver nvme_driver = {
+	.name		= "nvme",
+	.id_table	= nvme_id_table,
+	.probe		= nvme_probe,
+	.remove		= __devexit_p(nvme_remove),
+	.suspend	= nvme_suspend,
+	.resume		= nvme_resume,
+	.err_handler	= &nvme_err_handler,
+};
+
+static int __init nvme_init(void)
+{
+	int result = -EBUSY;
+
+	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
+	if (IS_ERR(nvme_thread))
+		return PTR_ERR(nvme_thread);
+
+	nvme_major = register_blkdev(nvme_major, "nvme");
+	if (nvme_major <= 0)
+		goto kill_kthread;
+
+	result = pci_register_driver(&nvme_driver);
+	if (result)
+		goto unregister_blkdev;
+	return 0;
+
+ unregister_blkdev:
+	unregister_blkdev(nvme_major, "nvme");
+ kill_kthread:
+	kthread_stop(nvme_thread);
+	return result;
+}
+
+static void __exit nvme_exit(void)
+{
+	pci_unregister_driver(&nvme_driver);
+	unregister_blkdev(nvme_major, "nvme");
+	kthread_stop(nvme_thread);
+}
+
+MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.8");
+module_init(nvme_init);
+module_exit(nvme_exit);
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index ad124525ac23..ec64e7f5d1ce 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -20,9 +20,6 @@
 */
 
 
-/* PARAMETERS */
-static int verbose; /* set this to 1 to see debugging messages and whatnot */
-
 #define BACKPACK_VERSION "2.0.2"
 
 #include <linux/module.h>
@@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */
 #include "ppc6lnx.c"
 #include "paride.h"
 
+/* PARAMETERS */
+static bool verbose; /* set this to 1 to see debugging messages and whatnot */
  
 
 #define PPCSTRUCT(pi) ((Interface *)(pi->private))
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 46b8136c31bb..ba2b6b5e5910 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -144,7 +144,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
 static DEFINE_MUTEX(pcd_mutex);
 static DEFINE_SPINLOCK(pcd_lock);
 
-module_param(verbose, bool, 0644);
+module_param(verbose, int, 0644);
 module_param(major, int, 0);
 module_param(name, charp, 0);
 module_param(nice, int, 0);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 869e7676d46f..831e3ac156e6 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -124,8 +124,9 @@
    by default.
 
 */
+#include <linux/types.h>
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PD_MAJOR;
 static char *name = PD_NAME;
 static int cluster = 64;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index f21b520ef419..ec8f9ed6326e 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -118,13 +118,15 @@
 #define PF_NAME		"pf"
 #define PF_UNITS	4
 
+#include <linux/types.h>
+
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is off
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PF_MAJOR;
 static char *name = PF_NAME;
 static int cluster = 64;
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index a79fb4f7ff62..4a27b1de5fcb 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -130,13 +130,14 @@
 #define PI_PG	4
 #endif
 
+#include <linux/types.h>
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is 0
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PG_MAJOR;
 static char *name = PG_NAME;
 static int disable = 0;
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 7179f79d7468..2596042eb987 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -109,13 +109,15 @@
 #define PT_NAME		"pt"
 #define PT_UNITS	4
 
+#include <linux/types.h>
+
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is on
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PT_MAJOR;
 static char *name = PT_NAME;
 static int disable = 0;
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 148ab944378d..3fd31dec8c9c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2184,6 +2184,8 @@ static ssize_t rbd_add(struct bus_type *bus,
 	INIT_LIST_HEAD(&rbd_dev->node);
 	INIT_LIST_HEAD(&rbd_dev->snaps);
 
+	init_rwsem(&rbd_dev->header.snap_rwsem);
+
 	/* generate unique id: find highest unique id, add one */
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index b70f0fca9a42..e7472f567c9d 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	       host->state == HST_DEV_SCAN);
 	spin_unlock_irq(&host->lock);
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 
@@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	BUG_ON(rc < 0);
 	crq->msg_bucket = (u32) rc;
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 }
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 0e376d46bdd1..7333b9e44411 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1744,12 +1744,11 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
 static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned int cmd, unsigned long arg)
 {
-	struct gendisk *disk = bdev->bd_disk;
 	void __user *usermem = (void __user *) arg;
 	int ret;
 
 	mutex_lock(&ub_mutex);
-	ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
+	ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem);
 	mutex_unlock(&ub_mutex);
 
 	return ret;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index ffd5ca919295..c4a60badf252 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -250,8 +250,8 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
 		return -ENOTTY;
 
-	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
-			      (void __user *)data);
+	return scsi_cmd_blk_ioctl(bdev, mode, cmd,
+				  (void __user *)data);
 }
 
 /* We provide getgeo only to please some old bootloader/partitioning tools */
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 4abd2bcd20fb..51a972704db5 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -148,7 +148,7 @@ static volatile int xdc_busy;
 static struct timer_list xd_watchdog_int;
 
 static volatile u_char xd_error;
-static int nodma = XD_DONT_USE_DMA;
+static bool nodma = XD_DONT_USE_DMA;
 
 static struct request_queue *xd_queue;
 
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 15ec4db194d1..0088bf60f368 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -39,9 +39,6 @@
 #include <linux/list.h>
 #include <linux/delay.h>
 #include <linux/freezer.h>
-#include <linux/loop.h>
-#include <linux/falloc.h>
-#include <linux/fs.h>
 
 #include <xen/events.h>
 #include <xen/page.h>
@@ -362,7 +359,7 @@ static int xen_blkbk_map(struct blkif_request *req,
 {
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	int i;
-	int nseg = req->nr_segments;
+	int nseg = req->u.rw.nr_segments;
 	int ret = 0;
 
 	/*
@@ -416,30 +413,25 @@ static int xen_blkbk_map(struct blkif_request *req,
 	return ret;
 }
 
-static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req)
+static int dispatch_discard_io(struct xen_blkif *blkif,
+				struct blkif_request *req)
 {
 	int err = 0;
 	int status = BLKIF_RSP_OKAY;
 	struct block_device *bdev = blkif->vbd.bdev;
 
-	if (blkif->blk_backend_type == BLKIF_BACKEND_PHY)
-		/* just forward the discard request */
+	blkif->st_ds_req++;
+
+	xen_blkif_get(blkif);
+	if (blkif->blk_backend_type == BLKIF_BACKEND_PHY ||
+	    blkif->blk_backend_type == BLKIF_BACKEND_FILE) {
+		unsigned long secure = (blkif->vbd.discard_secure &&
+			(req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
+			BLKDEV_DISCARD_SECURE : 0;
 		err = blkdev_issue_discard(bdev,
 				req->u.discard.sector_number,
 				req->u.discard.nr_sectors,
-				GFP_KERNEL, 0);
-	else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) {
-		/* punch a hole in the backing file */
-		struct loop_device *lo = bdev->bd_disk->private_data;
-		struct file *file = lo->lo_backing_file;
-
-		if (file->f_op->fallocate)
-			err = file->f_op->fallocate(file,
-				FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
-				req->u.discard.sector_number << 9,
-				req->u.discard.nr_sectors << 9);
-		else
-			err = -EOPNOTSUPP;
+				GFP_KERNEL, secure);
 	} else
 		err = -EOPNOTSUPP;
 
@@ -449,7 +441,9 @@ static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req)
 	} else if (err)
 		status = BLKIF_RSP_ERROR;
 
-	make_response(blkif, req->id, req->operation, status);
+	make_response(blkif, req->u.discard.id, req->operation, status);
+	xen_blkif_put(blkif);
+	return err;
 }
 
 static void xen_blk_drain_io(struct xen_blkif *blkif)
@@ -573,8 +567,11 @@ __do_block_io_op(struct xen_blkif *blkif)
 
 		/* Apply all sanity checks to /private copy/ of request. */
 		barrier();
-
-		if (dispatch_rw_block_io(blkif, &req, pending_req))
+		if (unlikely(req.operation == BLKIF_OP_DISCARD)) {
+			free_req(pending_req);
+			if (dispatch_discard_io(blkif, &req))
+				break;
+		} else if (dispatch_rw_block_io(blkif, &req, pending_req))
 			break;
 
 		/* Yield point for this unbounded loop. */
@@ -633,10 +630,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 		blkif->st_f_req++;
 		operation = WRITE_FLUSH;
 		break;
-	case BLKIF_OP_DISCARD:
-		blkif->st_ds_req++;
-		operation = REQ_DISCARD;
-		break;
 	default:
 		operation = 0; /* make gcc happy */
 		goto fail_response;
@@ -644,9 +637,9 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	}
 
 	/* Check that the number of segments is sane. */
-	nseg = req->nr_segments;
-	if (unlikely(nseg == 0 && operation != WRITE_FLUSH &&
-				operation != REQ_DISCARD) ||
+	nseg = req->u.rw.nr_segments;
+
+	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
 	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
 			 nseg);
@@ -654,12 +647,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 		goto fail_response;
 	}
 
-	preq.dev           = req->handle;
+	preq.dev           = req->u.rw.handle;
 	preq.sector_number = req->u.rw.sector_number;
 	preq.nr_sects      = 0;
 
 	pending_req->blkif     = blkif;
-	pending_req->id        = req->id;
+	pending_req->id        = req->u.rw.id;
 	pending_req->operation = req->operation;
 	pending_req->status    = BLKIF_RSP_OKAY;
 	pending_req->nr_pages  = nseg;
@@ -707,7 +700,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	 * the hypercall to unmap the grants - that is all done in
 	 * xen_blkbk_unmap.
 	 */
-	if (operation != REQ_DISCARD && xen_blkbk_map(req, pending_req, seg))
+	if (xen_blkbk_map(req, pending_req, seg))
 		goto fail_flush;
 
 	/*
@@ -739,23 +732,16 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
 	/* This will be hit if the operation was a flush or discard. */
 	if (!bio) {
-		BUG_ON(operation != WRITE_FLUSH && operation != REQ_DISCARD);
+		BUG_ON(operation != WRITE_FLUSH);
 
-		if (operation == WRITE_FLUSH) {
-			bio = bio_alloc(GFP_KERNEL, 0);
-			if (unlikely(bio == NULL))
-				goto fail_put_bio;
+		bio = bio_alloc(GFP_KERNEL, 0);
+		if (unlikely(bio == NULL))
+			goto fail_put_bio;
 
-			biolist[nbio++] = bio;
-			bio->bi_bdev    = preq.bdev;
-			bio->bi_private = pending_req;
-			bio->bi_end_io  = end_block_io_op;
-		} else if (operation == REQ_DISCARD) {
-			xen_blk_discard(blkif, req);
-			xen_blkif_put(blkif);
-			free_req(pending_req);
-			return 0;
-		}
+		biolist[nbio++] = bio;
+		bio->bi_bdev    = preq.bdev;
+		bio->bi_private = pending_req;
+		bio->bi_end_io  = end_block_io_op;
 	}
 
 	/*
@@ -784,7 +770,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	xen_blkbk_unmap(pending_req);
  fail_response:
 	/* Haven't submitted any bio's yet. */
-	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
 	free_req(pending_req);
 	msleep(1); /* back off a bit */
 	return -EIO;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index dfb1b3a43a5d..d0ee7edc9be8 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -60,58 +60,66 @@ struct blkif_common_response {
 	char dummy;
 };
 
-/* i386 protocol version */
-#pragma pack(push, 4)
-
 struct blkif_x86_32_request_rw {
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       id;           /* private guest value, echoed in resp  */
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
 	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
 
 struct blkif_x86_32_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+	uint64_t       id;           /* private guest value, echoed in resp  */
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-	uint64_t nr_sectors;
-};
+	uint64_t       nr_sectors;
+} __attribute__((__packed__));
 
 struct blkif_x86_32_request {
 	uint8_t        operation;    /* BLKIF_OP_???                         */
-	uint8_t        nr_segments;  /* number of segments                   */
-	blkif_vdev_t   handle;       /* only for read/write requests         */
-	uint64_t       id;           /* private guest value, echoed in resp  */
 	union {
 		struct blkif_x86_32_request_rw rw;
 		struct blkif_x86_32_request_discard discard;
 	} u;
-};
+} __attribute__((__packed__));
+
+/* i386 protocol version */
+#pragma pack(push, 4)
 struct blkif_x86_32_response {
 	uint64_t        id;              /* copied from request */
 	uint8_t         operation;       /* copied from request */
 	int16_t         status;          /* BLKIF_RSP_???       */
 };
 #pragma pack(pop)
-
 /* x86_64 protocol version */
 
 struct blkif_x86_64_request_rw {
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint32_t       _pad1;        /* offsetof(blkif_reqest..,u.rw.id)==8  */
+	uint64_t       id;
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
 	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
 
 struct blkif_x86_64_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+        uint32_t       _pad2;        /* offsetof(blkif_..,u.discard.id)==8   */
+	uint64_t       id;
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-	uint64_t nr_sectors;
-};
+	uint64_t       nr_sectors;
+} __attribute__((__packed__));
 
 struct blkif_x86_64_request {
 	uint8_t        operation;    /* BLKIF_OP_???                         */
-	uint8_t        nr_segments;  /* number of segments                   */
-	blkif_vdev_t   handle;       /* only for read/write requests         */
-	uint64_t       __attribute__((__aligned__(8))) id;
 	union {
 		struct blkif_x86_64_request_rw rw;
 		struct blkif_x86_64_request_discard discard;
 	} u;
-};
+} __attribute__((__packed__));
+
 struct blkif_x86_64_response {
 	uint64_t       __attribute__((__aligned__(8))) id;
 	uint8_t         operation;       /* copied from request */
@@ -156,6 +164,7 @@ struct xen_vbd {
 	/* Cached size parameter. */
 	sector_t		size;
 	bool			flush_support;
+	bool			discard_secure;
 };
 
 struct backend_info;
@@ -237,22 +246,23 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
 {
 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	dst->operation = src->operation;
-	dst->nr_segments = src->nr_segments;
-	dst->handle = src->handle;
-	dst->id = src->id;
 	switch (src->operation) {
 	case BLKIF_OP_READ:
 	case BLKIF_OP_WRITE:
 	case BLKIF_OP_WRITE_BARRIER:
 	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
 		dst->u.rw.sector_number = src->u.rw.sector_number;
 		barrier();
-		if (n > dst->nr_segments)
-			n = dst->nr_segments;
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
 		for (i = 0; i < n; i++)
 			dst->u.rw.seg[i] = src->u.rw.seg[i];
 		break;
 	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
 		dst->u.discard.sector_number = src->u.discard.sector_number;
 		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
 		break;
@@ -266,22 +276,23 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
 {
 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	dst->operation = src->operation;
-	dst->nr_segments = src->nr_segments;
-	dst->handle = src->handle;
-	dst->id = src->id;
 	switch (src->operation) {
 	case BLKIF_OP_READ:
 	case BLKIF_OP_WRITE:
 	case BLKIF_OP_WRITE_BARRIER:
 	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
 		dst->u.rw.sector_number = src->u.rw.sector_number;
 		barrier();
-		if (n > dst->nr_segments)
-			n = dst->nr_segments;
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
 		for (i = 0; i < n; i++)
 			dst->u.rw.seg[i] = src->u.rw.seg[i];
 		break;
 	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
 		dst->u.discard.sector_number = src->u.discard.sector_number;
 		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
 		break;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 37c794d31264..24a2fb57e5d0 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -338,6 +338,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 	if (q && q->flush_flags)
 		vbd->flush_support = true;
 
+	if (q && blk_queue_secdiscard(q))
+		vbd->discard_secure = true;
+
 	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
 		handle, blkif->domid);
 	return 0;
@@ -420,6 +423,15 @@ int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
 				state = 1;
 				blkif->blk_backend_type = BLKIF_BACKEND_PHY;
 			}
+			/* Optional. */
+			err = xenbus_printf(xbt, dev->nodename,
+				"discard-secure", "%d",
+				blkif->vbd.discard_secure);
+			if (err) {
+				xenbus_dev_fatal(dev, err,
+					"writting discard-secure");
+				goto kfree;
+			}
 		}
 	} else {
 		err = PTR_ERR(type);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9fd3ee203b1e..2f22874c0a37 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -98,7 +98,8 @@ struct blkfront_info
 	unsigned long shadow_free;
 	unsigned int feature_flush;
 	unsigned int flush_op;
-	unsigned int feature_discard;
+	unsigned int feature_discard:1;
+	unsigned int feature_secdiscard:1;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	int is_ready;
@@ -135,15 +136,15 @@ static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
 	BUG_ON(free >= BLK_RING_SIZE);
-	info->shadow_free = info->shadow[free].req.id;
-	info->shadow[free].req.id = 0x0fffffee; /* debug */
+	info->shadow_free = info->shadow[free].req.u.rw.id;
+	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
 	return free;
 }
 
 static void add_id_to_freelist(struct blkfront_info *info,
 			       unsigned long id)
 {
-	info->shadow[id].req.id  = info->shadow_free;
+	info->shadow[id].req.u.rw.id  = info->shadow_free;
 	info->shadow[id].request = NULL;
 	info->shadow_free = id;
 }
@@ -156,7 +157,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
 	if (end > nr_minors) {
 		unsigned long *bitmap, *old;
 
-		bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
+		bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
 				 GFP_KERNEL);
 		if (bitmap == NULL)
 			return -ENOMEM;
@@ -287,9 +288,9 @@ static int blkif_queue_request(struct request *req)
 	id = get_id_from_freelist(info);
 	info->shadow[id].request = req;
 
-	ring_req->id = id;
+	ring_req->u.rw.id = id;
 	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
-	ring_req->handle = info->handle;
+	ring_req->u.rw.handle = info->handle;
 
 	ring_req->operation = rq_data_dir(req) ?
 		BLKIF_OP_WRITE : BLKIF_OP_READ;
@@ -305,16 +306,21 @@ static int blkif_queue_request(struct request *req)
 		ring_req->operation = info->flush_op;
 	}
 
-	if (unlikely(req->cmd_flags & REQ_DISCARD)) {
+	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
 		/* id, sector_number and handle are set above. */
 		ring_req->operation = BLKIF_OP_DISCARD;
-		ring_req->nr_segments = 0;
 		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+		else
+			ring_req->u.discard.flag = 0;
 	} else {
-		ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
-		BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+		ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
+							   info->sg);
+		BUG_ON(ring_req->u.rw.nr_segments >
+		       BLKIF_MAX_SEGMENTS_PER_REQUEST);
 
-		for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
+		for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
 			buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
 			fsect = sg->offset >> 9;
 			lsect = fsect + (sg->length >> 9) - 1;
@@ -424,6 +430,8 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 		blk_queue_max_discard_sectors(rq, get_capacity(gd));
 		rq->limits.discard_granularity = info->discard_granularity;
 		rq->limits.discard_alignment = info->discard_alignment;
+		if (info->feature_secdiscard)
+			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
 	}
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
@@ -705,7 +713,9 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 static void blkif_completion(struct blk_shadow *s)
 {
 	int i;
-	for (i = 0; i < s->req.nr_segments; i++)
+	/* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
+	 * flag. */
+	for (i = 0; i < s->req.u.rw.nr_segments; i++)
 		gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
 }
 
@@ -736,7 +746,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		id   = bret->id;
 		req  = info->shadow[id].request;
 
-		blkif_completion(&info->shadow[id]);
+		if (bret->operation != BLKIF_OP_DISCARD)
+			blkif_completion(&info->shadow[id]);
 
 		add_id_to_freelist(info, id);
 
@@ -749,7 +760,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 					   info->gd->disk_name);
 				error = -EOPNOTSUPP;
 				info->feature_discard = 0;
+				info->feature_secdiscard = 0;
 				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
 			}
 			__blk_end_request_all(req, error);
 			break;
@@ -763,7 +776,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				error = -EOPNOTSUPP;
 			}
 			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
-				     info->shadow[id].req.nr_segments == 0)) {
+				     info->shadow[id].req.u.rw.nr_segments == 0)) {
 				printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
 				       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
 				       "barrier" :  "flush disk cache",
@@ -984,8 +997,8 @@ static int blkfront_probe(struct xenbus_device *dev,
 	INIT_WORK(&info->work, blkif_restart_queue);
 
 	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.id = i+1;
-	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+		info->shadow[i].req.u.rw.id = i+1;
+	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
 	/* Front end dir is a number, which is used as the id. */
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1019,9 +1032,9 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Stage 2: Set up free list. */
 	memset(&info->shadow, 0, sizeof(info->shadow));
 	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.id = i+1;
+		info->shadow[i].req.u.rw.id = i+1;
 	info->shadow_free = info->ring.req_prod_pvt;
-	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
 	/* Stage 3: Find pending requests and requeue them. */
 	for (i = 0; i < BLK_RING_SIZE; i++) {
@@ -1034,17 +1047,19 @@ static int blkif_recover(struct blkfront_info *info)
 		*req = copy[i].req;
 
 		/* We get a new request id, and must reset the shadow state. */
-		req->id = get_id_from_freelist(info);
-		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+		req->u.rw.id = get_id_from_freelist(info);
+		memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
 
+		if (req->operation != BLKIF_OP_DISCARD) {
 		/* Rewrite any grant references invalidated by susp/resume. */
-		for (j = 0; j < req->nr_segments; j++)
-			gnttab_grant_foreign_access_ref(
-				req->u.rw.seg[j].gref,
-				info->xbdev->otherend_id,
-				pfn_to_mfn(info->shadow[req->id].frame[j]),
-				rq_data_dir(info->shadow[req->id].request));
-		info->shadow[req->id].req = *req;
+			for (j = 0; j < req->u.rw.nr_segments; j++)
+				gnttab_grant_foreign_access_ref(
+					req->u.rw.seg[j].gref,
+					info->xbdev->otherend_id,
+					pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
+					rq_data_dir(info->shadow[req->u.rw.id].request));
+		}
+		info->shadow[req->u.rw.id].req = *req;
 
 		info->ring.req_prod_pvt++;
 	}
@@ -1135,11 +1150,13 @@ static void blkfront_setup_discard(struct blkfront_info *info)
 	char *type;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
+	unsigned int discard_secure;
 
 	type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
 	if (IS_ERR(type))
 		return;
 
+	info->feature_secdiscard = 0;
 	if (strncmp(type, "phy", 3) == 0) {
 		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
 			"discard-granularity", "%u", &discard_granularity,
@@ -1150,6 +1167,12 @@ static void blkfront_setup_discard(struct blkfront_info *info)
 			info->discard_granularity = discard_granularity;
 			info->discard_alignment = discard_alignment;
 		}
+		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "discard-secure", "%d", &discard_secure,
+			    NULL);
+		if (!err)
+			info->feature_secdiscard = discard_secure;
+
 	} else if (strncmp(type, "file", 4) == 0)
 		info->feature_discard = 1;