166 files changed, 4458 insertions, 2017 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086db8352..80e193d82e2e 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
 1. Quick usage instructions:
 ===========================
 
-  - Grab updated e2fsprogs from
-    ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
-    This is a patchset on top of e2fsprogs-1.39, which can be found at
+  - Compile and install the latest version of e2fsprogs (as of this
+    writing version 1.41) from:
+
+    http://sourceforge.net/project/showfiles.php?group_id=2406
+	
+	or
+
     ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
 
-  - It's still mke2fs -j /dev/hda1
+	or grab the latest git repository from:
+
+    git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+  - Create a new filesystem using the ext4dev filesystem type:
+
+    	# mke2fs -t ext4dev /dev/hda1
+
+    Or configure an existing ext3 filesystem to support extents and set
+    the test_fs flag to indicate that it's ok for an in-development
+    filesystem to touch this filesystem:
 
-  - mount /dev/hda1 /wherever -t ext4dev
+	# tune2fs -O extents -E test_fs /dev/hda1
 
-  - To enable extents,
+    If the filesystem was created with 128 byte inodes, it can be
+    converted to use 256 byte for greater efficiency via:
 
-	mount /dev/hda1 /wherever -t ext4dev -o extents
+        # tune2fs -I 256 /dev/hda1
 
-  - The filesystem is compatible with the ext3 driver until you add a file
-    which has extents (ie: `mount -o extents', then create a file).
+    (Note: we currently do not have tools to convert an ext4dev
+    filesystem back to ext3; so please do not do try this on production
+    filesystems.)
 
-    NOTE: The "extents" mount flag is temporary.  It will soon go away and
-    extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+  - Mounting:
+
+	# mount -t ext4dev /dev/hda1 /wherever
 
   - When comparing performance with other filesystems, remember that
-    ext3/4 by default offers higher data integrity guarantees than most.  So
-    when comparing with a metadata-only journalling filesystem, use `mount -o
-    data=writeback'.  And you might as well use `mount -o nobh' too along
-    with it.  Making the journal larger than the mke2fs default often helps
-    performance with metadata-intensive workloads.
+    ext3/4 by default offers higher data integrity guarantees than most.
+    So when comparing with a metadata-only journalling filesystem, such
+    as ext3, use `mount -o data=writeback'.  And you might as well use
+    `mount -o nobh' too along with it.  Making the journal larger than
+    the mke2fs default often helps performance with metadata-intensive
+    workloads.
 
 2. Features
 ===========
 
 2.1 Currently available
 
-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
 * extent format reduces metadata overhead (RAM, IO for access, transactions)
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redunancy in tree
-
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
-
-* dir_index and resize inode will be on by default
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+  flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
+  the ordering)
 
 2.2 Candidate features for future inclusion
 
-There are several under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them:
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjuction with
+  the uninit_bg feature (capability to do this is available in e2fsprogs
+  but a kernel thread to do lazy zeroing of unused inode table blocks
+  after filesystem is first mounted is required for safety)
 
-* improved file allocation (multi-block alloc, delayed alloc; basically done)
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
-  needs some e2fsck work)
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.
 
-Features like metadata checksumming have been discussed and planned for
-a bit but no patches exist yet so I'm not sure they're in the near-term
-roadmap.
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables.  Some test results available here:
 
-The big performance win will come with mballoc and delalloc.  CFS has
-been using mballoc for a few years already with Lustre, and IBM + Bull
-did a lot of benchmarking on it.  The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes.  I believe
-Alex is working on a new set of patches right now.
+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
 
 3. Options
 ==========
@@ -222,9 +243,11 @@ stripe=n		Number of filesystem blocks that mballoc will try
 			to use for allocation size and alignment. For RAID5/6
 			systems this should be the number of data
 			disks *  RAID chunk size in file system blocks.
-
+delalloc	(*)	Deferring block allocation until write-out time.
+nodelalloc		Disable delayed allocation. Blocks are allocation
+			when data is copied from user to page cache.
 Data Mode
----------
+=========
 There are 3 different data modes:
 
 * writeback mode
@@ -236,10 +259,10 @@ typically provide the best ext4 performance.
 
 * ordered mode
 In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction.  When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first.  In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction.  When it's time to write the new metadata
+out to disk, the associated data blocks are written first.  In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
 
 * journal mode
 data=journal mode provides full data and metadata journaling.  All new data is
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location.
 In the event of a crash, the journal can be replayed, bringing both data and
 metadata into a consistent state.  This mode is the slowest except when data
 needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all others modes.  Curently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
 
 References
 ==========
@@ -256,7 +280,8 @@ kernel source:	<file:fs/ext4/>
 		<file:fs/jbd2/>
 
 programs:	http://e2fsprogs.sourceforge.net/
-		http://ext2resize.sourceforge.net
 
 useful links:	http://fedoraproject.org/wiki/ext3-devel
 		http://www.bullopensource.org/ext4/
+		http://ext4.wiki.kernel.org/index.php/Main_Page
+		http://fedoraproject.org/wiki/Features/Ext4
diff --git a/drivers/gpu/drm/drm_memory.c b/drivers/gpu/drm/drm_memory.c
index 845081b44f63..0177012845c6 100644
--- a/drivers/gpu/drm/drm_memory.c
+++ b/drivers/gpu/drm/drm_memory.c
@@ -167,6 +167,11 @@ void drm_core_ioremap(struct drm_map *map, struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_core_ioremap);
 
+void drm_core_ioremap_wc(struct drm_map *map, struct drm_device *dev)
+{
+	map->handle = ioremap_wc(map->offset, map->size);
+}
+EXPORT_SYMBOL(drm_core_ioremap_wc);
 void drm_core_ioremapfree(struct drm_map *map, struct drm_device *dev)
 {
 	if (!map->handle || !map->size)
diff --git a/drivers/gpu/drm/radeon/radeon_cp.c b/drivers/gpu/drm/radeon/radeon_cp.c
index e53158f0ecb5..f0de81a5689d 100644
--- a/drivers/gpu/drm/radeon/radeon_cp.c
+++ b/drivers/gpu/drm/radeon/radeon_cp.c
@@ -1154,7 +1154,7 @@ static int radeon_do_init_cp(struct drm_device * dev, drm_radeon_init_t * init)
 			dev_priv->gart_info.mapping.size =
 			    dev_priv->gart_info.table_size;
 
-			drm_core_ioremap(&dev_priv->gart_info.mapping, dev);
+			drm_core_ioremap_wc(&dev_priv->gart_info.mapping, dev);
 			dev_priv->gart_info.addr =
 			    dev_priv->gart_info.mapping.handle;
 
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 781ea5950373..09a2bec7fd32 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -4,28 +4,33 @@
  * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
  *
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/cpl.php.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
  *
- * 2) under the terms of the "The BSD License" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/bsd-license.php.
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
  *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- *    copy of which is available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/gpl-license.php.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
  *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/mutex.h>
@@ -100,6 +105,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
 	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
 	if (dst_dev_addr)
 		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+	dev_addr->src_dev = dev;
 	return 0;
 }
 EXPORT_SYMBOL(rdma_copy_addr);
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
index fb9ed1489f95..6669287009c2 100644
--- a/drivers/infiniband/core/agent.h
+++ b/drivers/infiniband/core/agent.h
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: agent.h 1389 2004-12-27 22:56:47Z roland $
  */
 
 #ifndef __AGENT_H_
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index e85f7013de57..68883565b725 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index a47fe64e5c39..55738eead3bf 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: cm.c 4311 2005-12-05 18:42:01Z sean.hefty $
  */
 
 #include <linux/completion.h>
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 671f13738054..ae11d5cc74d0 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4,29 +4,33 @@
  * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
  * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
  *
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/cpl.php.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
  *
- * 2) under the terms of the "The BSD License" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/bsd-license.php.
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
  *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- *    copy of which is available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
  *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/completion.h>
@@ -126,8 +130,7 @@ struct rdma_id_private {
 
 	struct completion	comp;
 	atomic_t		refcount;
-	wait_queue_head_t	wait_remove;
-	atomic_t		dev_remove;
+	struct mutex		handler_mutex;
 
 	int			backlog;
 	int			timeout_ms;
@@ -351,26 +354,15 @@ static void cma_deref_id(struct rdma_id_private *id_priv)
 		complete(&id_priv->comp);
 }
 
-static int cma_disable_remove(struct rdma_id_private *id_priv,
+static int cma_disable_callback(struct rdma_id_private *id_priv,
 			      enum cma_state state)
 {
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&id_priv->lock, flags);
-	if (id_priv->state == state) {
-		atomic_inc(&id_priv->dev_remove);
-		ret = 0;
-	} else
-		ret = -EINVAL;
-	spin_unlock_irqrestore(&id_priv->lock, flags);
-	return ret;
-}
-
-static void cma_enable_remove(struct rdma_id_private *id_priv)
-{
-	if (atomic_dec_and_test(&id_priv->dev_remove))
-		wake_up(&id_priv->wait_remove);
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state != state) {
+		mutex_unlock(&id_priv->handler_mutex);
+		return -EINVAL;
+	}
+	return 0;
 }
 
 static int cma_has_cm_dev(struct rdma_id_private *id_priv)
@@ -395,8 +387,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
 	mutex_init(&id_priv->qp_mutex);
 	init_completion(&id_priv->comp);
 	atomic_set(&id_priv->refcount, 1);
-	init_waitqueue_head(&id_priv->wait_remove);
-	atomic_set(&id_priv->dev_remove, 0);
+	mutex_init(&id_priv->handler_mutex);
 	INIT_LIST_HEAD(&id_priv->listen_list);
 	INIT_LIST_HEAD(&id_priv->mc_list);
 	get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
@@ -923,7 +914,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	struct rdma_cm_event event;
 	int ret = 0;
 
-	if (cma_disable_remove(id_priv, CMA_CONNECT))
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -970,7 +961,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
 		break;
 	default:
-		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
 		       ib_event->event);
 		goto out;
 	}
@@ -980,12 +971,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return ret;
 }
 
@@ -998,6 +989,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	union cma_ip_addr *src, *dst;
 	__be16 port;
 	u8 ip_ver;
+	int ret;
 
 	if (cma_get_net_info(ib_event->private_data, listen_id->ps,
 			     &ip_ver, &port, &src, &dst))
@@ -1022,10 +1014,11 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	if (rt->num_paths == 2)
 		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
 
-	ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
 	ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
-	ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
-	rt->addr.dev_addr.dev_type = RDMA_NODE_IB_CA;
+	ret = rdma_translate_ip(&id->route.addr.src_addr,
+				&id->route.addr.dev_addr);
+	if (ret)
+		goto destroy_id;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
 	id_priv->state = CMA_CONNECT;
@@ -1095,7 +1088,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	int offset, ret;
 
 	listen_id = cm_id->context;
-	if (cma_disable_remove(listen_id, CMA_LISTEN))
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
 		return -ECONNABORTED;
 
 	memset(&event, 0, sizeof event);
@@ -1116,7 +1109,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		goto out;
 	}
 
-	atomic_inc(&conn_id->dev_remove);
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
 	mutex_lock(&lock);
 	ret = cma_acquire_dev(conn_id);
 	mutex_unlock(&lock);
@@ -1138,7 +1131,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		    !cma_is_ud_ps(conn_id->id.ps))
 			ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
 		mutex_unlock(&lock);
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		goto out;
 	}
 
@@ -1147,11 +1140,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 
 release_conn_id:
 	cma_exch(conn_id, CMA_DESTROYING);
-	cma_enable_remove(conn_id);
+	mutex_unlock(&conn_id->handler_mutex);
 	rdma_destroy_id(&conn_id->id);
 
 out:
-	cma_enable_remove(listen_id);
+	mutex_unlock(&listen_id->handler_mutex);
 	return ret;
 }
 
@@ -1217,7 +1210,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 	struct sockaddr_in *sin;
 	int ret = 0;
 
-	if (cma_disable_remove(id_priv, CMA_CONNECT))
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -1261,12 +1254,12 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.iw = NULL;
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
 	}
 
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return ret;
 }
 
@@ -1282,7 +1275,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	struct ib_device_attr attr;
 
 	listen_id = cm_id->context;
-	if (cma_disable_remove(listen_id, CMA_LISTEN))
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
 		return -ECONNABORTED;
 
 	/* Create a new RDMA id for the new IW CM ID */
@@ -1294,19 +1287,19 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 		goto out;
 	}
 	conn_id = container_of(new_cm_id, struct rdma_id_private, id);
-	atomic_inc(&conn_id->dev_remove);
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
 	conn_id->state = CMA_CONNECT;
 
 	dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr);
 	if (!dev) {
 		ret = -EADDRNOTAVAIL;
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
 	ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
 	if (ret) {
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
@@ -1315,7 +1308,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	ret = cma_acquire_dev(conn_id);
 	mutex_unlock(&lock);
 	if (ret) {
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
@@ -1331,7 +1324,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 
 	ret = ib_query_device(conn_id->id.device, &attr);
 	if (ret) {
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
@@ -1347,14 +1340,17 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 		/* User wants to destroy the CM ID */
 		conn_id->cm_id.iw = NULL;
 		cma_exch(conn_id, CMA_DESTROYING);
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(&conn_id->id);
+		goto out;
 	}
 
+	mutex_unlock(&conn_id->handler_mutex);
+
 out:
 	if (dev)
 		dev_put(dev);
-	cma_enable_remove(listen_id);
+	mutex_unlock(&listen_id->handler_mutex);
 	return ret;
 }
 
@@ -1446,7 +1442,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
 	ret = rdma_listen(id, id_priv->backlog);
 	if (ret)
 		printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
-		       "listening on device %s", ret, cma_dev->device->name);
+		       "listening on device %s\n", ret, cma_dev->device->name);
 }
 
 static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -1586,7 +1582,7 @@ static void cma_work_handler(struct work_struct *_work)
 	struct rdma_id_private *id_priv = work->id;
 	int destroy = 0;
 
-	atomic_inc(&id_priv->dev_remove);
+	mutex_lock(&id_priv->handler_mutex);
 	if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
 		goto out;
 
@@ -1595,7 +1591,7 @@ static void cma_work_handler(struct work_struct *_work)
 		destroy = 1;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	cma_deref_id(id_priv);
 	if (destroy)
 		rdma_destroy_id(&id_priv->id);
@@ -1758,7 +1754,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 	struct rdma_cm_event event;
 
 	memset(&event, 0, sizeof event);
-	atomic_inc(&id_priv->dev_remove);
+	mutex_lock(&id_priv->handler_mutex);
 
 	/*
 	 * Grab mutex to block rdma_destroy_id() from removing the device while
@@ -1787,13 +1783,13 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 
 	if (id_priv->id.event_handler(&id_priv->id, &event)) {
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		cma_deref_id(id_priv);
 		rdma_destroy_id(&id_priv->id);
 		return;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	cma_deref_id(id_priv);
 }
 
@@ -2120,7 +2116,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 	struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
 	int ret = 0;
 
-	if (cma_disable_remove(id_priv, CMA_CONNECT))
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -2151,7 +2147,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 		event.status = 0;
 		break;
 	default:
-		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
 		       ib_event->event);
 		goto out;
 	}
@@ -2161,12 +2157,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return ret;
 }
 
@@ -2564,8 +2560,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
 	int ret;
 
 	id_priv = mc->id_priv;
-	if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) &&
-	    cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
+	if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
+	    cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
 		return 0;
 
 	mutex_lock(&id_priv->qp_mutex);
@@ -2590,12 +2586,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
 	ret = id_priv->id.event_handler(&id_priv->id, &event);
 	if (ret) {
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return 0;
 	}
 
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return 0;
 }
 
@@ -2754,6 +2750,7 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv)
 {
 	struct rdma_cm_event event;
 	enum cma_state state;
+	int ret = 0;
 
 	/* Record that we want to remove the device */
 	state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
@@ -2761,15 +2758,18 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv)
 		return 0;
 
 	cma_cancel_operation(id_priv, state);
-	wait_event(id_priv->wait_remove, !atomic_read(&id_priv->dev_remove));
+	mutex_lock(&id_priv->handler_mutex);
 
 	/* Check for destruction from another callback. */
 	if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
-		return 0;
+		goto out;
 
 	memset(&event, 0, sizeof event);
 	event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
-	return id_priv->id.event_handler(&id_priv->id, &event);
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
 }
 
 static void cma_process_remove(struct cma_device *cma_dev)
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 7ad47a4b166b..05ac36e6acdb 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef _CORE_PRIV_H
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5ac5ffee05cb..7913b804311e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: device.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index 1286dc1b98b2..4507043d24c8 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: fmr_pool.c 2730 2005-06-28 16:43:03Z sean.hefty $
  */
 
 #include <linux/errno.h>
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 8b75010016ec..05ce331733b0 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mad_priv.h 5596 2006-03-03 01:00:07Z sean.hefty $
  */
 
 #ifndef __IB_MAD_PRIV_H__
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index a5e2a310f312..d0ef7d61c037 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mad_rmpp.c 1921 2005-03-02 22:58:44Z sean.hefty $
  */
 
 #include "mad_priv.h"
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
index f0616fd22494..3d336bff1148 100644
--- a/drivers/infiniband/core/mad_rmpp.h
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mad_rmpp.h 1921 2005-02-25 22:58:44Z sean.hefty $
  */
 
 #ifndef __MAD_RMPP_H__
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
index c972d7235764..019bd4b0863e 100644
--- a/drivers/infiniband/core/packer.c
+++ b/drivers/infiniband/core/packer.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: packer.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/string.h>
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index cf474ec27070..1341de793e51 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: sa_query.c 2811 2005-07-06 18:11:43Z halr $
  */
 
 #include <linux/module.h>
@@ -361,7 +359,7 @@ static void update_sm_ah(struct work_struct *work)
 {
 	struct ib_sa_port *port =
 		container_of(work, struct ib_sa_port, update_task);
-	struct ib_sa_sm_ah *new_ah, *old_ah;
+	struct ib_sa_sm_ah *new_ah;
 	struct ib_port_attr port_attr;
 	struct ib_ah_attr   ah_attr;
 
@@ -397,12 +395,9 @@ static void update_sm_ah(struct work_struct *work)
 	}
 
 	spin_lock_irq(&port->ah_lock);
-	old_ah = port->sm_ah;
 	port->sm_ah = new_ah;
 	spin_unlock_irq(&port->ah_lock);
 
-	if (old_ah)
-		kref_put(&old_ah->ref, free_sm_ah);
 }
 
 static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
@@ -413,8 +408,17 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
 	    event->event == IB_EVENT_PKEY_CHANGE ||
 	    event->event == IB_EVENT_SM_CHANGE   ||
 	    event->event == IB_EVENT_CLIENT_REREGISTER) {
-		struct ib_sa_device *sa_dev;
-		sa_dev = container_of(handler, typeof(*sa_dev), event_handler);
+		unsigned long flags;
+		struct ib_sa_device *sa_dev =
+			container_of(handler, typeof(*sa_dev), event_handler);
+		struct ib_sa_port *port =
+			&sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+		spin_lock_irqsave(&port->ah_lock, flags);
+		if (port->sm_ah)
+			kref_put(&port->sm_ah->ref, free_sm_ah);
+		port->sm_ah = NULL;
+		spin_unlock_irqrestore(&port->ah_lock, flags);
 
 		schedule_work(&sa_dev->port[event->element.port_num -
 					    sa_dev->start_port].update_task);
@@ -519,6 +523,10 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
 	unsigned long flags;
 
 	spin_lock_irqsave(&query->port->ah_lock, flags);
+	if (!query->port->sm_ah) {
+		spin_unlock_irqrestore(&query->port->ah_lock, flags);
+		return -EAGAIN;
+	}
 	kref_get(&query->port->sm_ah->ref);
 	query->sm_ah = query->port->sm_ah;
 	spin_unlock_irqrestore(&query->port->ah_lock, flags);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 95756551cf7c..4d1042115598 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: sysfs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include "core_priv.h"
@@ -665,6 +663,120 @@ static struct class ib_class = {
 	.dev_uevent = ib_device_uevent,
 };
 
+/* Show a given an attribute in the statistics group */
+static ssize_t show_protocol_stat(const struct device *device,
+			    struct device_attribute *attr, char *buf,
+			    unsigned offset)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	union rdma_protocol_stats stats;
+	ssize_t ret;
+
+	ret = dev->get_protocol_stats(dev, &stats);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%llu\n",
+		       (unsigned long long) ((u64 *) &stats)[offset]);
+}
+
+/* generate a read-only iwarp statistics attribute */
+#define IW_STATS_ENTRY(name)						\
+static ssize_t show_##name(struct device *device,			\
+			   struct device_attribute *attr, char *buf)	\
+{									\
+	return show_protocol_stat(device, attr, buf,			\
+				  offsetof(struct iw_protocol_stats, name) / \
+				  sizeof (u64));			\
+}									\
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = {
+	&dev_attr_ipInReceives.attr,
+	&dev_attr_ipInHdrErrors.attr,
+	&dev_attr_ipInTooBigErrors.attr,
+	&dev_attr_ipInNoRoutes.attr,
+	&dev_attr_ipInAddrErrors.attr,
+	&dev_attr_ipInUnknownProtos.attr,
+	&dev_attr_ipInTruncatedPkts.attr,
+	&dev_attr_ipInDiscards.attr,
+	&dev_attr_ipInDelivers.attr,
+	&dev_attr_ipOutForwDatagrams.attr,
+	&dev_attr_ipOutRequests.attr,
+	&dev_attr_ipOutDiscards.attr,
+	&dev_attr_ipOutNoRoutes.attr,
+	&dev_attr_ipReasmTimeout.attr,
+	&dev_attr_ipReasmReqds.attr,
+	&dev_attr_ipReasmOKs.attr,
+	&dev_attr_ipReasmFails.attr,
+	&dev_attr_ipFragOKs.attr,
+	&dev_attr_ipFragFails.attr,
+	&dev_attr_ipFragCreates.attr,
+	&dev_attr_ipInMcastPkts.attr,
+	&dev_attr_ipOutMcastPkts.attr,
+	&dev_attr_ipInBcastPkts.attr,
+	&dev_attr_ipOutBcastPkts.attr,
+	&dev_attr_tcpRtoAlgorithm.attr,
+	&dev_attr_tcpRtoMin.attr,
+	&dev_attr_tcpRtoMax.attr,
+	&dev_attr_tcpMaxConn.attr,
+	&dev_attr_tcpActiveOpens.attr,
+	&dev_attr_tcpPassiveOpens.attr,
+	&dev_attr_tcpAttemptFails.attr,
+	&dev_attr_tcpEstabResets.attr,
+	&dev_attr_tcpCurrEstab.attr,
+	&dev_attr_tcpInSegs.attr,
+	&dev_attr_tcpOutSegs.attr,
+	&dev_attr_tcpRetransSegs.attr,
+	&dev_attr_tcpInErrs.attr,
+	&dev_attr_tcpOutRsts.attr,
+	NULL
+};
+
+static struct attribute_group iw_stats_group = {
+	.name	= "proto_stats",
+	.attrs	= iw_proto_stats_attrs,
+};
+
 int ib_device_register_sysfs(struct ib_device *device)
 {
 	struct device *class_dev = &device->dev;
@@ -707,6 +819,12 @@ int ib_device_register_sysfs(struct ib_device *device)
 		}
 	}
 
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+		ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+		if (ret)
+			goto err_put;
+	}
+
 	return 0;
 
 err_put:
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index b25675faaaf5..9494005d1c9a 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ucm.c 4311 2005-12-05 18:42:01Z sean.hefty $
  */
 
 #include <linux/completion.h>
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 997c07db6d8f..8ec7876bedcf 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ud_header.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/errno.h>
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index a1768dbb0720..6f7c096abf13 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs_mem.c 2743 2005-06-28 22:27:59Z roland $
  */
 
 #include <linux/mm.h>
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 208c7f34323c..268a2d23b7c9 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: user_mad.c 5596 2006-03-03 01:00:07Z sean.hefty $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 376a57ce1b40..b3ea9587dc80 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs.h 2559 2005-06-06 19:43:16Z roland $
  */
 
 #ifndef UVERBS_H
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 2c3bff5fe867..56feab6c251e 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs_cmd.c 2708 2005-06-24 17:27:21Z roland $
  */
 
 #include <linux/file.h>
@@ -919,7 +917,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
 		resp->wc[i].opcode 	   = wc[i].opcode;
 		resp->wc[i].vendor_err 	   = wc[i].vendor_err;
 		resp->wc[i].byte_len 	   = wc[i].byte_len;
-		resp->wc[i].imm_data 	   = (__u32 __force) wc[i].imm_data;
+		resp->wc[i].ex.imm_data    = (__u32 __force) wc[i].ex.imm_data;
 		resp->wc[i].qp_num 	   = wc[i].qp->qp_num;
 		resp->wc[i].src_qp 	   = wc[i].src_qp;
 		resp->wc[i].wc_flags 	   = wc[i].wc_flags;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 0f34858e31e7..aeee856c4060 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs_main.c 2733 2005-06-28 19:14:34Z roland $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 05042089de6e..a7da9be43e61 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -34,8 +34,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/errno.h>
@@ -317,7 +315,6 @@ static const struct {
 } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
 	[IB_QPS_RESET] = {
 		[IB_QPS_RESET] = { .valid = 1 },
-		[IB_QPS_ERR]   = { .valid = 1 },
 		[IB_QPS_INIT]  = {
 			.valid = 1,
 			.req_param = {
@@ -755,6 +752,52 @@ int ib_dereg_mr(struct ib_mr *mr)
 }
 EXPORT_SYMBOL(ib_dereg_mr);
 
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->alloc_fast_reg_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+							  int max_page_list_len)
+{
+	struct ib_fast_reg_page_list *page_list;
+
+	if (!device->alloc_fast_reg_page_list)
+		return ERR_PTR(-ENOSYS);
+
+	page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+	if (!IS_ERR(page_list)) {
+		page_list->device = device;
+		page_list->max_page_list_len = max_page_list_len;
+	}
+
+	return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
 /* Memory windows */
 
 struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c
index b1441aeb60c2..dd05c4835642 100644
--- a/drivers/infiniband/hw/amso1100/c2_rnic.c
+++ b/drivers/infiniband/hw/amso1100/c2_rnic.c
@@ -454,7 +454,7 @@ int __devinit c2_rnic_init(struct c2_dev *c2dev)
 	    (IB_DEVICE_RESIZE_MAX_WR |
 	     IB_DEVICE_CURR_QP_STATE_MOD |
 	     IB_DEVICE_SYS_IMAGE_GUID |
-	     IB_DEVICE_ZERO_STAG |
+	     IB_DEVICE_LOCAL_DMA_LKEY |
 	     IB_DEVICE_MEM_WINDOW);
 
 	/* Allocate the qptr_array */
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index 3f441fc57c17..f6d5747153a5 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -145,7 +145,9 @@ static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
 	}
 	wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
 	memset(wqe, 0, sizeof(*wqe));
-	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, qpid, 7);
+	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD,
+		       T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7,
+		       T3_SOPEOP);
 	wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
 	sge_cmd = qpid << 8 | 3;
 	wqe->sge_cmd = cpu_to_be64(sge_cmd);
@@ -276,7 +278,7 @@ int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
 	if (!wq->qpid)
 		return -ENOMEM;
 
-	wq->rq = kzalloc(depth * sizeof(u64), GFP_KERNEL);
+	wq->rq = kzalloc(depth * sizeof(struct t3_swrq), GFP_KERNEL);
 	if (!wq->rq)
 		goto err1;
 
@@ -300,6 +302,7 @@ int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
 	if (!kernel_domain)
 		wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
 					(wq->qpid << rdev_p->qpshift);
+	wq->rdev = rdev_p;
 	PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __func__,
 	     wq->qpid, wq->doorbell, (unsigned long long) wq->udb);
 	return 0;
@@ -558,7 +561,7 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
 	wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
 	memset(wqe, 0, sizeof(*wqe));
 	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0,
-		       T3_CTL_QP_TID, 7);
+		       T3_CTL_QP_TID, 7, T3_SOPEOP);
 	wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
 	sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3;
 	wqe->sge_cmd = cpu_to_be64(sge_cmd);
@@ -674,7 +677,7 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
 		build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag,
 			       Q_GENBIT(rdev_p->ctrl_qp.wptr,
 					T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID,
-			       wr_len);
+			       wr_len, T3_SOPEOP);
 		if (flag == T3_COMPLETION_FLAG)
 			ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID);
 		len -= 96;
@@ -816,6 +819,13 @@ int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
 			     0, 0);
 }
 
+int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr)
+{
+	*stag = T3_STAG_UNSET;
+	return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR,
+			     0, 0, 0ULL, 0, 0, pbl_size, pbl_addr);
+}
+
 int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
 {
 	struct t3_rdma_init_wr *wqe;
@@ -1257,13 +1267,16 @@ proc_cqe:
 		wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe);
 		PDBG("%s completing sq idx %ld\n", __func__,
 		     Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2));
-		*cookie = (wq->sq +
-			   Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id;
+		*cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id;
 		wq->sq_rptr++;
 	} else {
 		PDBG("%s completing rq idx %ld\n", __func__,
 		     Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
-		*cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
+		*cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id;
+		if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr)
+			cxio_hal_pblpool_free(wq->rdev,
+				wq->rq[Q_PTR2IDX(wq->rq_rptr,
+				wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE);
 		wq->rq_rptr++;
 	}
 
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index 6e128f6bab05..656fe47bc84f 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -45,15 +45,17 @@
 #define T3_CTRL_QP_SIZE_LOG2  8
 #define T3_CTRL_CQ_ID    0
 
-/* TBD */
 #define T3_MAX_NUM_RI (1<<15)
 #define T3_MAX_NUM_QP (1<<15)
 #define T3_MAX_NUM_CQ (1<<15)
 #define T3_MAX_NUM_PD (1<<15)
 #define T3_MAX_PBL_SIZE 256
 #define T3_MAX_RQ_SIZE 1024
+#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
+#define T3_MAX_CQ_DEPTH 8192
 #define T3_MAX_NUM_STAG (1<<15)
 #define T3_MAX_MR_SIZE 0x100000000ULL
+#define T3_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
 
 #define T3_STAG_UNSET 0xffffffff
 
@@ -165,6 +167,7 @@ int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
 int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
 		   u32 pbl_addr);
 int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
+int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr);
 int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag);
 int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr);
 void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
index f1a25a821a45..04618f7bfbb3 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_wr.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -39,6 +39,9 @@
 
 #define T3_MAX_SGE      4
 #define T3_MAX_INLINE	64
+#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3)
+#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024)
+#define T3_STAG0_PAGE_SHIFT 15
 
 #define Q_EMPTY(rptr,wptr) ((rptr)==(wptr))
 #define Q_FULL(rptr,wptr,size_log2)  ( (((wptr)-(rptr))>>(size_log2)) && \
@@ -72,7 +75,8 @@ enum t3_wr_opcode {
 	T3_WR_BIND = FW_WROPCODE_RI_BIND_MW,
 	T3_WR_RCV = FW_WROPCODE_RI_RECEIVE,
 	T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
-	T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP
+	T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP,
+	T3_WR_FASTREG = FW_WROPCODE_RI_FASTREGISTER_MR
 } __attribute__ ((packed));
 
 enum t3_rdma_opcode {
@@ -89,7 +93,8 @@ enum t3_rdma_opcode {
 	T3_FAST_REGISTER,
 	T3_LOCAL_INV,
 	T3_QP_MOD,
-	T3_BYPASS
+	T3_BYPASS,
+	T3_RDMA_READ_REQ_WITH_INV,
 } __attribute__ ((packed));
 
 static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
@@ -103,6 +108,7 @@ static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
 		case T3_WR_BIND: return T3_BIND_MW;
 		case T3_WR_INIT: return T3_RDMA_INIT;
 		case T3_WR_QP_MOD: return T3_QP_MOD;
+		case T3_WR_FASTREG: return T3_FAST_REGISTER;
 		default: break;
 	}
 	return -1;
@@ -170,11 +176,54 @@ struct t3_send_wr {
 	struct t3_sge sgl[T3_MAX_SGE];	/* 4+ */
 };
 
+#define T3_MAX_FASTREG_DEPTH 24
+#define T3_MAX_FASTREG_FRAG 10
+
+struct t3_fastreg_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	__be32 stag;		/* 2 */
+	__be32 len;
+	__be32 va_base_hi;	/* 3 */
+	__be32 va_base_lo_fbo;
+	__be32 page_type_perms; /* 4 */
+	__be32 reserved1;
+	__be64 pbl_addrs[0];	/* 5+ */
+};
+
+/*
+ * If a fastreg wr spans multiple wqes, then the 2nd fragment look like this.
+ */
+struct t3_pbl_frag {
+	struct fw_riwrh wrh;	/* 0 */
+	__be64 pbl_addrs[14];	/* 1..14 */
+};
+
+#define S_FR_PAGE_COUNT		24
+#define M_FR_PAGE_COUNT		0xff
+#define V_FR_PAGE_COUNT(x)	((x) << S_FR_PAGE_COUNT)
+#define G_FR_PAGE_COUNT(x)	((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT)
+
+#define S_FR_PAGE_SIZE		16
+#define M_FR_PAGE_SIZE		0x1f
+#define V_FR_PAGE_SIZE(x)	((x) << S_FR_PAGE_SIZE)
+#define G_FR_PAGE_SIZE(x)	((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE)
+
+#define S_FR_TYPE		8
+#define M_FR_TYPE		0x1
+#define V_FR_TYPE(x)		((x) << S_FR_TYPE)
+#define G_FR_TYPE(x)		((((x) >> S_FR_TYPE)) & M_FR_TYPE)
+
+#define S_FR_PERMS		0
+#define M_FR_PERMS		0xff
+#define V_FR_PERMS(x)		((x) << S_FR_PERMS)
+#define G_FR_PERMS(x)		((((x) >> S_FR_PERMS)) & M_FR_PERMS)
+
 struct t3_local_inv_wr {
 	struct fw_riwrh wrh;	/* 0 */
 	union t3_wrid wrid;	/* 1 */
 	__be32 stag;		/* 2 */
-	__be32 reserved3;
+	__be32 reserved;
 };
 
 struct t3_rdma_write_wr {
@@ -193,7 +242,8 @@ struct t3_rdma_read_wr {
 	struct fw_riwrh wrh;	/* 0 */
 	union t3_wrid wrid;	/* 1 */
 	u8 rdmaop;		/* 2 */
-	u8 reserved[3];
+	u8 local_inv;
+	u8 reserved[2];
 	__be32 rem_stag;
 	__be64 rem_to;		/* 3 */
 	__be32 local_stag;	/* 4 */
@@ -201,18 +251,6 @@ struct t3_rdma_read_wr {
 	__be64 local_to;	/* 5 */
 };
 
-enum t3_addr_type {
-	T3_VA_BASED_TO = 0x0,
-	T3_ZERO_BASED_TO = 0x1
-} __attribute__ ((packed));
-
-enum t3_mem_perms {
-	T3_MEM_ACCESS_LOCAL_READ = 0x1,
-	T3_MEM_ACCESS_LOCAL_WRITE = 0x2,
-	T3_MEM_ACCESS_REM_READ = 0x4,
-	T3_MEM_ACCESS_REM_WRITE = 0x8
-} __attribute__ ((packed));
-
 struct t3_bind_mw_wr {
 	struct fw_riwrh wrh;	/* 0 */
 	union t3_wrid wrid;	/* 1 */
@@ -336,6 +374,11 @@ struct t3_genbit {
 	__be64 genbit;
 };
 
+struct t3_wq_in_err {
+	u64 flit[13];
+	u64 err;
+};
+
 enum rdma_init_wr_flags {
 	MPA_INITIATOR = (1<<0),
 	PRIV_QP = (1<<1),
@@ -346,13 +389,16 @@ union t3_wr {
 	struct t3_rdma_write_wr write;
 	struct t3_rdma_read_wr read;
 	struct t3_receive_wr recv;
+	struct t3_fastreg_wr fastreg;
+	struct t3_pbl_frag pbl_frag;
 	struct t3_local_inv_wr local_inv;
 	struct t3_bind_mw_wr bind;
 	struct t3_bypass_wr bypass;
 	struct t3_rdma_init_wr init;
 	struct t3_modify_qp_wr qp_mod;
 	struct t3_genbit genbit;
-	u64 flit[16];
+	struct t3_wq_in_err wq_in_err;
+	__be64 flit[16];
 };
 
 #define T3_SQ_CQE_FLIT	  13
@@ -366,12 +412,18 @@ static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe)
 	return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags));
 }
 
+enum t3_wr_hdr_bits {
+	T3_EOP = 1,
+	T3_SOP = 2,
+	T3_SOPEOP = T3_EOP|T3_SOP,
+};
+
 static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op,
 				  enum t3_wr_flags flags, u8 genbit, u32 tid,
-				  u8 len)
+				  u8 len, u8 sopeop)
 {
 	wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) |
-					 V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) |
+					 V_FW_RIWR_SOPEOP(sopeop) |
 					 V_FW_RIWR_FLAGS(flags));
 	wmb();
 	wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) |
@@ -404,6 +456,7 @@ enum tpt_addr_type {
 };
 
 enum tpt_mem_perm {
+	TPT_MW_BIND = 0x10,
 	TPT_LOCAL_READ = 0x8,
 	TPT_LOCAL_WRITE = 0x4,
 	TPT_REMOTE_READ = 0x2,
@@ -615,6 +668,11 @@ struct t3_swsq {
 	int			signaled;
 };
 
+struct t3_swrq {
+	__u64			wr_id;
+	__u32			pbl_addr;
+};
+
 /*
  * A T3 WQ implements both the SQ and RQ.
  */
@@ -631,14 +689,15 @@ struct t3_wq {
 	u32 sq_wptr;			/* sq_wptr - sq_rptr == count of */
 	u32 sq_rptr;			/* pending wrs */
 	u32 sq_size_log2;		/* sq size */
-	u64 *rq;			/* SW RQ (holds consumer wr_ids */
+	struct t3_swrq *rq;		/* SW RQ (holds consumer wr_ids */
 	u32 rq_wptr;			/* rq_wptr - rq_rptr == count of */
 	u32 rq_rptr;			/* pending wrs */
-	u64 *rq_oldest_wr;		/* oldest wr on the SW RQ */
+	struct t3_swrq *rq_oldest_wr;	/* oldest wr on the SW RQ */
 	u32 rq_size_log2;		/* rq size */
 	u32 rq_addr;			/* rq adapter address */
 	void __iomem *doorbell;		/* kernel db */
 	u64 udb;			/* user db if any */
+	struct cxio_rdev *rdev;
 };
 
 struct t3_cq {
@@ -659,7 +718,7 @@ struct t3_cq {
 
 static inline void cxio_set_wq_in_error(struct t3_wq *wq)
 {
-	wq->queue->flit[13] = 1;
+	wq->queue->wq_in_err.err = 1;
 }
 
 static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index 71554eacb13c..4489c89d6710 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -71,18 +71,16 @@ static void rnic_init(struct iwch_dev *rnicp)
 	idr_init(&rnicp->mmidr);
 	spin_lock_init(&rnicp->lock);
 
-	rnicp->attr.vendor_id = 0x168;
-	rnicp->attr.vendor_part_id = 7;
 	rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
-	rnicp->attr.max_wrs = (1UL << 24) - 1;
+	rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
 	rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
 	rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
 	rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
-	rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1;
+	rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH;
 	rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
 	rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
 	rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
-	rnicp->attr.mem_pgsizes_bitmask = 0x7FFF;	/* 4KB-128MB */
+	rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK;
 	rnicp->attr.max_mr_size = T3_MAX_MR_SIZE;
 	rnicp->attr.can_resize_wq = 0;
 	rnicp->attr.max_rdma_reads_per_qp = 8;
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
index d2409a505e8d..3773453b2cf0 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.h
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -48,8 +48,6 @@ struct iwch_qp;
 struct iwch_mr;
 
 struct iwch_rnic_attributes {
-	u32 vendor_id;
-	u32 vendor_part_id;
 	u32 max_qps;
 	u32 max_wrs;				/* Max for any SQ/RQ */
 	u32 max_sge_per_wr;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
index 4ee8ccd0a9e5..cf5474ae68ff 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cq.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c
@@ -81,6 +81,7 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
 	wc->wr_id = cookie;
 	wc->qp = &qhp->ibqp;
 	wc->vendor_err = CQE_STATUS(cqe);
+	wc->wc_flags = 0;
 
 	PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
 	     "lo 0x%x cookie 0x%llx\n", __func__,
@@ -94,6 +95,11 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
 		else
 			wc->byte_len = 0;
 		wc->opcode = IB_WC_RECV;
+		if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV ||
+		    CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) {
+			wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe);
+			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+		}
 	} else {
 		switch (CQE_OPCODE(cqe)) {
 		case T3_RDMA_WRITE:
@@ -105,17 +111,20 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
 			break;
 		case T3_SEND:
 		case T3_SEND_WITH_SE:
+		case T3_SEND_WITH_INV:
+		case T3_SEND_WITH_SE_INV:
 			wc->opcode = IB_WC_SEND;
 			break;
 		case T3_BIND_MW:
 			wc->opcode = IB_WC_BIND_MW;
 			break;
 
-		/* these aren't supported yet */
-		case T3_SEND_WITH_INV:
-		case T3_SEND_WITH_SE_INV:
 		case T3_LOCAL_INV:
+			wc->opcode = IB_WC_LOCAL_INV;
+			break;
 		case T3_FAST_REGISTER:
+			wc->opcode = IB_WC_FAST_REG_MR;
+			break;
 		default:
 			printk(KERN_ERR MOD "Unexpected opcode %d "
 			       "in the CQE received for QPID=0x%0x\n",
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 95f82cfb6c54..b89640aa6e10 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -56,6 +56,7 @@
 #include "iwch_provider.h"
 #include "iwch_cm.h"
 #include "iwch_user.h"
+#include "common.h"
 
 static int iwch_modify_port(struct ib_device *ibdev,
 			    u8 port, int port_modify_mask,
@@ -747,6 +748,7 @@ static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd)
 	mhp->attr.type = TPT_MW;
 	mhp->attr.stag = stag;
 	mmid = (stag) >> 8;
+	mhp->ibmw.rkey = stag;
 	insert_handle(rhp, &rhp->mmidr, mhp, mmid);
 	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
 	return &(mhp->ibmw);
@@ -768,6 +770,68 @@ static int iwch_dealloc_mw(struct ib_mw *mw)
 	return 0;
 }
 
+static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+{
+	struct iwch_dev *rhp;
+	struct iwch_pd *php;
+	struct iwch_mr *mhp;
+	u32 mmid;
+	u32 stag = 0;
+	int ret;
+
+	php = to_iwch_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+	ret = iwch_alloc_pbl(mhp, pbl_depth);
+	if (ret) {
+		kfree(mhp);
+		return ERR_PTR(ret);
+	}
+	mhp->attr.pbl_size = pbl_depth;
+	ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
+				 mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret) {
+		iwch_free_pbl(mhp);
+		kfree(mhp);
+		return ERR_PTR(ret);
+	}
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.type = TPT_NON_SHARED_MR;
+	mhp->attr.stag = stag;
+	mhp->attr.state = 1;
+	mmid = (stag) >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+	return &(mhp->ibmr);
+}
+
+static struct ib_fast_reg_page_list *iwch_alloc_fastreg_pbl(
+					struct ib_device *device,
+					int page_list_len)
+{
+	struct ib_fast_reg_page_list *page_list;
+
+	page_list = kmalloc(sizeof *page_list + page_list_len * sizeof(u64),
+			    GFP_KERNEL);
+	if (!page_list)
+		return ERR_PTR(-ENOMEM);
+
+	page_list->page_list = (u64 *)(page_list + 1);
+	page_list->max_page_list_len = page_list_len;
+
+	return page_list;
+}
+
+static void iwch_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list)
+{
+	kfree(page_list);
+}
+
 static int iwch_destroy_qp(struct ib_qp *ib_qp)
 {
 	struct iwch_dev *rhp;
@@ -843,6 +907,15 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
 	 */
 	sqsize = roundup_pow_of_two(attrs->cap.max_send_wr);
 	wqsize = roundup_pow_of_two(rqsize + sqsize);
+
+	/*
+	 * Kernel users need more wq space for fastreg WRs which can take
+	 * 2 WR fragments.
+	 */
+	ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
+	if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
+		wqsize = roundup_pow_of_two(rqsize +
+				roundup_pow_of_two(attrs->cap.max_send_wr * 2));
 	PDBG("%s wqsize %d sqsize %d rqsize %d\n", __func__,
 	     wqsize, sqsize, rqsize);
 	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
@@ -851,7 +924,6 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
 	qhp->wq.size_log2 = ilog2(wqsize);
 	qhp->wq.rq_size_log2 = ilog2(rqsize);
 	qhp->wq.sq_size_log2 = ilog2(sqsize);
-	ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
 	if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq,
 			   ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) {
 		kfree(qhp);
@@ -935,10 +1007,10 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
 	qhp->ibqp.qp_num = qhp->wq.qpid;
 	init_timer(&(qhp->timer));
 	PDBG("%s sq_num_entries %d, rq_num_entries %d "
-	     "qpid 0x%0x qhp %p dma_addr 0x%llx size %d\n",
+	     "qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n",
 	     __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
 	     qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr,
-	     1 << qhp->wq.size_log2);
+	     1 << qhp->wq.size_log2, qhp->wq.rq_addr);
 	return &qhp->ibqp;
 }
 
@@ -1023,6 +1095,29 @@ static int iwch_query_gid(struct ib_device *ibdev, u8 port,
 	return 0;
 }
 
+static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev)
+{
+	struct ethtool_drvinfo info;
+	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+	char *cp, *next;
+	unsigned fw_maj, fw_min, fw_mic;
+
+	rtnl_lock();
+	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	rtnl_unlock();
+
+	next = info.fw_version + 1;
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_maj);
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_min);
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_mic);
+
+	return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) |
+	       (fw_mic & 0xffff);
+}
+
 static int iwch_query_device(struct ib_device *ibdev,
 			     struct ib_device_attr *props)
 {
@@ -1033,7 +1128,10 @@ static int iwch_query_device(struct ib_device *ibdev,
 	dev = to_iwch_dev(ibdev);
 	memset(props, 0, sizeof *props);
 	memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+	props->hw_ver = dev->rdev.t3cdev_p->type;
+	props->fw_ver = fw_vers_string_to_u64(dev);
 	props->device_cap_flags = dev->device_cap_flags;
+	props->page_size_cap = dev->attr.mem_pgsizes_bitmask;
 	props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor;
 	props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device;
 	props->max_mr_size = dev->attr.max_mr_size;
@@ -1048,6 +1146,7 @@ static int iwch_query_device(struct ib_device *ibdev,
 	props->max_mr = dev->attr.max_mem_regs;
 	props->max_pd = dev->attr.max_pds;
 	props->local_ca_ack_delay = 0;
+	props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH;
 
 	return 0;
 }
@@ -1088,6 +1187,28 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
 	return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
 }
 
+static int fw_supports_fastreg(struct iwch_dev *iwch_dev)
+{
+	struct ethtool_drvinfo info;
+	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+	char *cp, *next;
+	unsigned fw_maj, fw_min;
+
+	rtnl_lock();
+	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	rtnl_unlock();
+
+	next = info.fw_version+1;
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_maj);
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_min);
+
+	PDBG("%s maj %u min %u\n", __func__, fw_maj, fw_min);
+
+	return fw_maj > 6 || (fw_maj == 6 && fw_min > 0);
+}
+
 static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
@@ -1127,6 +1248,61 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
 		       iwch_dev->rdev.rnic_info.pdev->device);
 }
 
+static int iwch_get_mib(struct ib_device *ibdev,
+			union rdma_protocol_stats *stats)
+{
+	struct iwch_dev *dev;
+	struct tp_mib_stats m;
+	int ret;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	dev = to_iwch_dev(ibdev);
+	ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
+	if (ret)
+		return -ENOSYS;
+
+	memset(stats, 0, sizeof *stats);
+	stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) +
+				m.ipInReceive_lo;
+	stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) +
+				  m.ipInHdrErrors_lo;
+	stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) +
+				   m.ipInAddrErrors_lo;
+	stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) +
+				      m.ipInUnknownProtos_lo;
+	stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) +
+				 m.ipInDiscards_lo;
+	stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) +
+				 m.ipInDelivers_lo;
+	stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) +
+				  m.ipOutRequests_lo;
+	stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) +
+				  m.ipOutDiscards_lo;
+	stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) +
+				  m.ipOutNoRoutes_lo;
+	stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout;
+	stats->iw.ipReasmReqds = (u64) m.ipReasmReqds;
+	stats->iw.ipReasmOKs = (u64) m.ipReasmOKs;
+	stats->iw.ipReasmFails = (u64) m.ipReasmFails;
+	stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens;
+	stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens;
+	stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails;
+	stats->iw.tcpEstabResets = (u64) m.tcpEstabResets;
+	stats->iw.tcpOutRsts = (u64) m.tcpOutRsts;
+	stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab;
+	stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) +
+			      m.tcpInSegs_lo;
+	stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) +
+			       m.tcpOutSegs_lo;
+	stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) +
+				  m.tcpRetransSeg_lo;
+	stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) +
+			      m.tcpInErrs_lo;
+	stats->iw.tcpRtoMin = (u64) m.tcpRtoMin;
+	stats->iw.tcpRtoMax = (u64) m.tcpRtoMax;
+	return 0;
+}
+
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
 static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
@@ -1136,7 +1312,7 @@ static struct device_attribute *iwch_class_attributes[] = {
 	&dev_attr_hw_rev,
 	&dev_attr_fw_ver,
 	&dev_attr_hca_type,
-	&dev_attr_board_id
+	&dev_attr_board_id,
 };
 
 int iwch_register_device(struct iwch_dev *dev)
@@ -1149,8 +1325,12 @@ int iwch_register_device(struct iwch_dev *dev)
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
 	memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
 	dev->ibdev.owner = THIS_MODULE;
-	dev->device_cap_flags =
-	    (IB_DEVICE_ZERO_STAG | IB_DEVICE_MEM_WINDOW);
+	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
+
+	/* cxgb3 supports STag 0. */
+	dev->ibdev.local_dma_lkey = 0;
+	if (fw_supports_fastreg(dev))
+		dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
 
 	dev->ibdev.uverbs_cmd_mask =
 	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
@@ -1202,15 +1382,16 @@ int iwch_register_device(struct iwch_dev *dev)
 	dev->ibdev.alloc_mw = iwch_alloc_mw;
 	dev->ibdev.bind_mw = iwch_bind_mw;
 	dev->ibdev.dealloc_mw = iwch_dealloc_mw;
-
+	dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr;
+	dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl;
+	dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl;
 	dev->ibdev.attach_mcast = iwch_multicast_attach;
 	dev->ibdev.detach_mcast = iwch_multicast_detach;
 	dev->ibdev.process_mad = iwch_process_mad;
-
 	dev->ibdev.req_notify_cq = iwch_arm_cq;
 	dev->ibdev.post_send = iwch_post_send;
 	dev->ibdev.post_recv = iwch_post_receive;
-
+	dev->ibdev.get_protocol_stats = iwch_get_mib;
 
 	dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
 	if (!dev->ibdev.iwcm)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
index 836163fc5429..f5ceca05c435 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h
@@ -296,14 +296,6 @@ static inline u32 iwch_ib_to_tpt_access(int acc)
 	       TPT_LOCAL_READ;
 }
 
-static inline u32 iwch_ib_to_mwbind_access(int acc)
-{
-	return (acc & IB_ACCESS_REMOTE_WRITE ? T3_MEM_ACCESS_REM_WRITE : 0) |
-	       (acc & IB_ACCESS_REMOTE_READ ? T3_MEM_ACCESS_REM_READ : 0) |
-	       (acc & IB_ACCESS_LOCAL_WRITE ? T3_MEM_ACCESS_LOCAL_WRITE : 0) |
-	       T3_MEM_ACCESS_LOCAL_READ;
-}
-
 enum iwch_mmid_state {
 	IWCH_STAG_STATE_VALID,
 	IWCH_STAG_STATE_INVALID
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index 992613799228..9a3be3a9d5dc 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -33,10 +33,11 @@
 #include "iwch.h"
 #include "iwch_cm.h"
 #include "cxio_hal.h"
+#include "cxio_resource.h"
 
 #define NO_SUPPORT -1
 
-static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
 				u8 * flit_cnt)
 {
 	int i;
@@ -44,59 +45,44 @@ static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
 
 	switch (wr->opcode) {
 	case IB_WR_SEND:
-	case IB_WR_SEND_WITH_IMM:
 		if (wr->send_flags & IB_SEND_SOLICITED)
 			wqe->send.rdmaop = T3_SEND_WITH_SE;
 		else
 			wqe->send.rdmaop = T3_SEND;
 		wqe->send.rem_stag = 0;
 		break;
-#if 0				/* Not currently supported */
-	case TYPE_SEND_INVALIDATE:
-	case TYPE_SEND_INVALIDATE_IMMEDIATE:
-		wqe->send.rdmaop = T3_SEND_WITH_INV;
-		wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
-		break;
-	case TYPE_SEND_SE_INVALIDATE:
-		wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
-		wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
+	case IB_WR_SEND_WITH_INV:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
+		else
+			wqe->send.rdmaop = T3_SEND_WITH_INV;
+		wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey);
 		break;
-#endif
 	default:
-		break;
+		return -EINVAL;
 	}
 	if (wr->num_sge > T3_MAX_SGE)
 		return -EINVAL;
 	wqe->send.reserved[0] = 0;
 	wqe->send.reserved[1] = 0;
 	wqe->send.reserved[2] = 0;
-	if (wr->opcode == IB_WR_SEND_WITH_IMM) {
-		plen = 4;
-		wqe->send.sgl[0].stag = wr->ex.imm_data;
-		wqe->send.sgl[0].len = __constant_cpu_to_be32(0);
-		wqe->send.num_sgle = __constant_cpu_to_be32(0);
-		*flit_cnt = 5;
-	} else {
-		plen = 0;
-		for (i = 0; i < wr->num_sge; i++) {
-			if ((plen + wr->sg_list[i].length) < plen) {
-				return -EMSGSIZE;
-			}
-			plen += wr->sg_list[i].length;
-			wqe->send.sgl[i].stag =
-			    cpu_to_be32(wr->sg_list[i].lkey);
-			wqe->send.sgl[i].len =
-			    cpu_to_be32(wr->sg_list[i].length);
-			wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
-		}
-		wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
-		*flit_cnt = 4 + ((wr->num_sge) << 1);
+	plen = 0;
+	for (i = 0; i < wr->num_sge; i++) {
+		if ((plen + wr->sg_list[i].length) < plen)
+			return -EMSGSIZE;
+
+		plen += wr->sg_list[i].length;
+		wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
+		wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+		wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
 	}
+	wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
+	*flit_cnt = 4 + ((wr->num_sge) << 1);
 	wqe->send.plen = cpu_to_be32(plen);
 	return 0;
 }
 
-static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
 				 u8 *flit_cnt)
 {
 	int i;
@@ -137,15 +123,18 @@ static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
 	return 0;
 }
 
-static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
 				u8 *flit_cnt)
 {
 	if (wr->num_sge > 1)
 		return -EINVAL;
 	wqe->read.rdmaop = T3_READ_REQ;
+	if (wr->opcode == IB_WR_RDMA_READ_WITH_INV)
+		wqe->read.local_inv = 1;
+	else
+		wqe->read.local_inv = 0;
 	wqe->read.reserved[0] = 0;
 	wqe->read.reserved[1] = 0;
-	wqe->read.reserved[2] = 0;
 	wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
 	wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr);
 	wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey);
@@ -155,6 +144,57 @@ static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
 	return 0;
 }
 
+static int build_fastreg(union t3_wr *wqe, struct ib_send_wr *wr,
+				u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq)
+{
+	int i;
+	__be64 *p;
+
+	if (wr->wr.fast_reg.page_list_len > T3_MAX_FASTREG_DEPTH)
+		return -EINVAL;
+	*wr_cnt = 1;
+	wqe->fastreg.stag = cpu_to_be32(wr->wr.fast_reg.rkey);
+	wqe->fastreg.len = cpu_to_be32(wr->wr.fast_reg.length);
+	wqe->fastreg.va_base_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
+	wqe->fastreg.va_base_lo_fbo =
+				cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff);
+	wqe->fastreg.page_type_perms = cpu_to_be32(
+		V_FR_PAGE_COUNT(wr->wr.fast_reg.page_list_len) |
+		V_FR_PAGE_SIZE(wr->wr.fast_reg.page_shift-12) |
+		V_FR_TYPE(TPT_VATO) |
+		V_FR_PERMS(iwch_ib_to_tpt_access(wr->wr.fast_reg.access_flags)));
+	p = &wqe->fastreg.pbl_addrs[0];
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++) {
+
+		/* If we need a 2nd WR, then set it up */
+		if (i == T3_MAX_FASTREG_FRAG) {
+			*wr_cnt = 2;
+			wqe = (union t3_wr *)(wq->queue +
+				Q_PTR2IDX((wq->wptr+1), wq->size_log2));
+			build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0,
+			       Q_GENBIT(wq->wptr + 1, wq->size_log2),
+			       0, 1 + wr->wr.fast_reg.page_list_len - T3_MAX_FASTREG_FRAG,
+			       T3_EOP);
+
+			p = &wqe->pbl_frag.pbl_addrs[0];
+		}
+		*p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]);
+	}
+	*flit_cnt = 5 + wr->wr.fast_reg.page_list_len;
+	if (*flit_cnt > 15)
+		*flit_cnt = 15;
+	return 0;
+}
+
+static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr,
+				u8 *flit_cnt)
+{
+	wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey);
+	wqe->local_inv.reserved = 0;
+	*flit_cnt = sizeof(struct t3_local_inv_wr) >> 3;
+	return 0;
+}
+
 /*
  * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now.
  */
@@ -205,23 +245,106 @@ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
 	return 0;
 }
 
-static int iwch_build_rdma_recv(struct iwch_dev *rhp, union t3_wr *wqe,
+static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe,
 				struct ib_recv_wr *wr)
 {
-	int i;
-	if (wr->num_sge > T3_MAX_SGE)
-		return -EINVAL;
+	int i, err = 0;
+	u32 pbl_addr[T3_MAX_SGE];
+	u8 page_size[T3_MAX_SGE];
+
+	err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr,
+			       page_size);
+	if (err)
+		return err;
+	wqe->recv.pagesz[0] = page_size[0];
+	wqe->recv.pagesz[1] = page_size[1];
+	wqe->recv.pagesz[2] = page_size[2];
+	wqe->recv.pagesz[3] = page_size[3];
 	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
 	for (i = 0; i < wr->num_sge; i++) {
 		wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
 		wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+
+		/* to in the WQE == the offset into the page */
+		wqe->recv.sgl[i].to = cpu_to_be64(((u32) wr->sg_list[i].addr) %
+				(1UL << (12 + page_size[i])));
+
+		/* pbl_addr is the adapters address in the PBL */
+		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
+	}
+	for (; i < T3_MAX_SGE; i++) {
+		wqe->recv.sgl[i].stag = 0;
+		wqe->recv.sgl[i].len = 0;
+		wqe->recv.sgl[i].to = 0;
+		wqe->recv.pbl_addr[i] = 0;
+	}
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].pbl_addr = 0;
+	return 0;
+}
+
+static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe,
+				struct ib_recv_wr *wr)
+{
+	int i;
+	u32 pbl_addr;
+	u32 pbl_offset;
+
+
+	/*
+	 * The T3 HW requires the PBL in the HW recv descriptor to reference
+	 * a PBL entry.  So we allocate the max needed PBL memory here and pass
+	 * it to the uP in the recv WR.  The uP will build the PBL and setup
+	 * the HW recv descriptor.
+	 */
+	pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE);
+	if (!pbl_addr)
+		return -ENOMEM;
+
+	/*
+	 * Compute the 8B aligned offset.
+	 */
+	pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3;
+
+	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+
+	for (i = 0; i < wr->num_sge; i++) {
+
+		/*
+		 * Use a 128MB page size. This and an imposed 128MB
+		 * sge length limit allows us to require only a 2-entry HW
+		 * PBL for each SGE.  This restriction is acceptable since
+		 * since it is not possible to allocate 128MB of contiguous
+		 * DMA coherent memory!
+		 */
+		if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN)
+			return -EINVAL;
+		wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT;
+
+		/*
+		 * T3 restricts a recv to all zero-stag or all non-zero-stag.
+		 */
+		if (wr->sg_list[i].lkey != 0)
+			return -EINVAL;
+		wqe->recv.sgl[i].stag = 0;
+		wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
 		wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
+		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset);
+		pbl_offset += 2;
 	}
 	for (; i < T3_MAX_SGE; i++) {
+		wqe->recv.pagesz[i] = 0;
 		wqe->recv.sgl[i].stag = 0;
 		wqe->recv.sgl[i].len = 0;
 		wqe->recv.sgl[i].to = 0;
+		wqe->recv.pbl_addr[i] = 0;
 	}
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].pbl_addr = pbl_addr;
 	return 0;
 }
 
@@ -238,6 +361,7 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	u32 num_wrs;
 	unsigned long flag;
 	struct t3_swsq *sqp;
+	int wr_cnt = 1;
 
 	qhp = to_iwch_qp(ibqp);
 	spin_lock_irqsave(&qhp->lock, flag);
@@ -262,33 +386,45 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		t3_wr_flags = 0;
 		if (wr->send_flags & IB_SEND_SOLICITED)
 			t3_wr_flags |= T3_SOLICITED_EVENT_FLAG;
-		if (wr->send_flags & IB_SEND_FENCE)
-			t3_wr_flags |= T3_READ_FENCE_FLAG;
 		if (wr->send_flags & IB_SEND_SIGNALED)
 			t3_wr_flags |= T3_COMPLETION_FLAG;
 		sqp = qhp->wq.sq +
 		      Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
 		switch (wr->opcode) {
 		case IB_WR_SEND:
-		case IB_WR_SEND_WITH_IMM:
+		case IB_WR_SEND_WITH_INV:
+			if (wr->send_flags & IB_SEND_FENCE)
+				t3_wr_flags |= T3_READ_FENCE_FLAG;
 			t3_wr_opcode = T3_WR_SEND;
-			err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
+			err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
 			break;
 		case IB_WR_RDMA_WRITE:
 		case IB_WR_RDMA_WRITE_WITH_IMM:
 			t3_wr_opcode = T3_WR_WRITE;
-			err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
+			err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
 			break;
 		case IB_WR_RDMA_READ:
+		case IB_WR_RDMA_READ_WITH_INV:
 			t3_wr_opcode = T3_WR_READ;
 			t3_wr_flags = 0; /* T3 reads are always signaled */
-			err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
+			err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
 			if (err)
 				break;
 			sqp->read_len = wqe->read.local_len;
 			if (!qhp->wq.oldest_read)
 				qhp->wq.oldest_read = sqp;
 			break;
+		case IB_WR_FAST_REG_MR:
+			t3_wr_opcode = T3_WR_FASTREG;
+			err = build_fastreg(wqe, wr, &t3_wr_flit_cnt,
+						 &wr_cnt, &qhp->wq);
+			break;
+		case IB_WR_LOCAL_INV:
+			if (wr->send_flags & IB_SEND_FENCE)
+				t3_wr_flags |= T3_LOCAL_FENCE_FLAG;
+			t3_wr_opcode = T3_WR_INV_STAG;
+			err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt);
+			break;
 		default:
 			PDBG("%s post of type=%d TBD!\n", __func__,
 			     wr->opcode);
@@ -307,14 +443,15 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
 		build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags,
 			       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
-			       0, t3_wr_flit_cnt);
+			       0, t3_wr_flit_cnt,
+			       (wr_cnt == 1) ? T3_SOPEOP : T3_SOP);
 		PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n",
 		     __func__, (unsigned long long) wr->wr_id, idx,
 		     Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2),
 		     sqp->opcode);
 		wr = wr->next;
 		num_wrs--;
-		++(qhp->wq.wptr);
+		qhp->wq.wptr += wr_cnt;
 		++(qhp->wq.sq_wptr);
 	}
 	spin_unlock_irqrestore(&qhp->lock, flag);
@@ -345,21 +482,27 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 		return -EINVAL;
 	}
 	while (wr) {
+		if (wr->num_sge > T3_MAX_SGE) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
 		idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
 		wqe = (union t3_wr *) (qhp->wq.queue + idx);
 		if (num_wrs)
-			err = iwch_build_rdma_recv(qhp->rhp, wqe, wr);
+			if (wr->sg_list[0].lkey)
+				err = build_rdma_recv(qhp, wqe, wr);
+			else
+				err = build_zero_stag_recv(qhp, wqe, wr);
 		else
 			err = -ENOMEM;
 		if (err) {
 			*bad_wr = wr;
 			break;
 		}
-		qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] =
-			wr->wr_id;
 		build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG,
 			       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
-			       0, sizeof(struct t3_receive_wr) >> 3);
+			       0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP);
 		PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x "
 		     "wqe %p \n", __func__, (unsigned long long) wr->wr_id,
 		     idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe);
@@ -419,10 +562,10 @@ int iwch_bind_mw(struct ib_qp *qp,
 	sgl.lkey = mw_bind->mr->lkey;
 	sgl.length = mw_bind->length;
 	wqe->bind.reserved = 0;
-	wqe->bind.type = T3_VA_BASED_TO;
+	wqe->bind.type = TPT_VATO;
 
 	/* TBD: check perms */
-	wqe->bind.perms = iwch_ib_to_mwbind_access(mw_bind->mw_access_flags);
+	wqe->bind.perms = iwch_ib_to_tpt_access(mw_bind->mw_access_flags);
 	wqe->bind.mr_stag = cpu_to_be32(mw_bind->mr->lkey);
 	wqe->bind.mw_stag = cpu_to_be32(mw->rkey);
 	wqe->bind.mw_len = cpu_to_be32(mw_bind->length);
@@ -430,7 +573,7 @@ int iwch_bind_mw(struct ib_qp *qp,
 	err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size);
 	if (err) {
 		spin_unlock_irqrestore(&qhp->lock, flag);
-	        return err;
+		return err;
 	}
 	wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
 	sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
@@ -441,10 +584,9 @@ int iwch_bind_mw(struct ib_qp *qp,
 	sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED);
 	wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr);
 	wqe->bind.mr_pagesz = page_size;
-	wqe->flit[T3_SQ_COOKIE_FLIT] = mw_bind->wr_id;
 	build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags,
 		       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0,
-			        sizeof(struct t3_bind_mw_wr) >> 3);
+		       sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP);
 	++(qhp->wq.wptr);
 	++(qhp->wq.sq_wptr);
 	spin_unlock_irqrestore(&qhp->lock, flag);
@@ -758,7 +900,8 @@ static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
 	init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
 	init_attr.rqe_count = iwch_rqes_posted(qhp);
 	init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0;
-	init_attr.flags |= capable(CAP_NET_BIND_SERVICE) ? PRIV_QP : 0;
+	if (!qhp->ibqp.uobject)
+		init_attr.flags |= PRIV_QP;
 	if (peer2peer) {
 		init_attr.rtr_type = RTR_READ;
 		if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator)
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index ce1ab0571be3..0792d930c481 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -531,7 +531,7 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq)
 {
 	struct ehca_eq *eq = &shca->eq;
 	struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
-	u64 eqe_value;
+	u64 eqe_value, ret;
 	unsigned long flags;
 	int eqe_cnt, i;
 	int eq_empty = 0;
@@ -583,8 +583,13 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq)
 			ehca_dbg(&shca->ib_device,
 				 "No eqe found for irq event");
 		goto unlock_irq_spinlock;
-	} else if (!is_irq)
+	} else if (!is_irq) {
+		ret = hipz_h_eoi(eq->ist);
+		if (ret != H_SUCCESS)
+			ehca_err(&shca->ib_device,
+				 "bad return code EOI -rc = %ld\n", ret);
 		ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+	}
 	if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
 		ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
 	/* enable irq for new packets */
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 482103eb6eac..598844d2edc9 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -923,6 +923,7 @@ static struct of_device_id ehca_device_table[] =
 	},
 	{},
 };
+MODULE_DEVICE_TABLE(of, ehca_device_table);
 
 static struct of_platform_driver ehca_driver = {
 	.name        = "ehca",
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index f093b0033daf..dd9bc68f1c7b 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -544,8 +544,16 @@ int ehca_post_recv(struct ib_qp *qp,
 		   struct ib_recv_wr *recv_wr,
 		   struct ib_recv_wr **bad_recv_wr)
 {
-	return internal_post_recv(container_of(qp, struct ehca_qp, ib_qp),
-				  qp->device, recv_wr, bad_recv_wr);
+	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+
+	/* Reject WR if QP is in RESET state */
+	if (unlikely(my_qp->state == IB_QPS_RESET)) {
+		ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
+			 my_qp->state, qp->qp_num);
+		return -EINVAL;
+	}
+
+	return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
 }
 
 int ehca_post_srq_recv(struct ib_srq *srq,
@@ -681,7 +689,7 @@ poll_cq_one_read_cqe:
 	wc->dlid_path_bits = cqe->dlid;
 	wc->src_qp = cqe->remote_qp_number;
 	wc->wc_flags = cqe->w_completion_flags;
-	wc->imm_data = cpu_to_be32(cqe->immediate_data);
+	wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
 	wc->sl = cqe->service_level;
 
 poll_cq_one_exit0:
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
index 5245e13c3a30..415d3a465de6 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.c
+++ b/drivers/infiniband/hw/ehca/hcp_if.c
@@ -933,3 +933,13 @@ u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
 				       r_cb,
 				       0, 0, 0, 0);
 }
+
+u64 hipz_h_eoi(int irq)
+{
+	unsigned long xirr;
+
+	iosync();
+	xirr = (0xffULL << 24) | irq;
+
+	return plpar_hcall_norets(H_EOI, xirr);
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
index 60ce02b70663..2c3c6e0ea5c2 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.h
+++ b/drivers/infiniband/hw/ehca/hcp_if.h
@@ -260,5 +260,6 @@ u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
 		      const u64 ressource_handle,
 		      void *rblock,
 		      unsigned long *byte_count);
+u64 hipz_h_eoi(int irq);
 
 #endif /* __HCP_IF_H__ */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
index a03bd28d9b48..d385e4168c97 100644
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -82,7 +82,7 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
 		wc->uqueue[head].opcode = entry->opcode;
 		wc->uqueue[head].vendor_err = entry->vendor_err;
 		wc->uqueue[head].byte_len = entry->byte_len;
-		wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data;
+		wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
 		wc->uqueue[head].qp_num = entry->qp->qp_num;
 		wc->uqueue[head].src_qp = entry->src_qp;
 		wc->uqueue[head].wc_flags = entry->wc_flags;
diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c
index 8eee7830f042..fb70712ac85c 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba7220.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c
@@ -2228,8 +2228,8 @@ static void ipath_autoneg_send(struct ipath_devdata *dd, int which)
 		0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
 		0x40000001, 0x1388, 0x15e, /* rest 0's */
 		};
-	dcnt = sizeof(madpayload_start)/sizeof(madpayload_start[0]);
-	hcnt = sizeof(hdr)/sizeof(hdr[0]);
+	dcnt = ARRAY_SIZE(madpayload_start);
+	hcnt = ARRAY_SIZE(hdr);
 	if (!swapped) {
 		/* for maintainability, do it at runtime */
 		for (i = 0; i < hcnt; i++) {
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
index 5f9315d77a43..be4fc9ada8e7 100644
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -111,9 +111,9 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp,
 	nip->revision = cpu_to_be32((majrev << 16) | minrev);
 	nip->local_port_num = port;
 	vendor = dd->ipath_vendorid;
-	nip->vendor_id[0] = 0;
-	nip->vendor_id[1] = vendor >> 8;
-	nip->vendor_id[2] = vendor;
+	nip->vendor_id[0] = IPATH_SRC_OUI_1;
+	nip->vendor_id[1] = IPATH_SRC_OUI_2;
+	nip->vendor_id[2] = IPATH_SRC_OUI_3;
 
 	return reply(smp);
 }
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
index 108df667d2ee..97710522624d 100644
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -1703,11 +1703,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	case OP(SEND_LAST_WITH_IMMEDIATE):
 	send_last_imm:
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else {
 			/* Immediate data comes after BTH */
-			wc.imm_data = ohdr->u.imm_data;
+			wc.ex.imm_data = ohdr->u.imm_data;
 		}
 		hdrsize += 4;
 		wc.wc_flags = IB_WC_WITH_IMM;
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index a4b5521567fe..af051f757663 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -331,7 +331,7 @@ again:
 	switch (wqe->wr.opcode) {
 	case IB_WR_SEND_WITH_IMM:
 		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.imm_data = wqe->wr.ex.imm_data;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
 		/* FALLTHROUGH */
 	case IB_WR_SEND:
 		if (!ipath_get_rwqe(qp, 0))
@@ -342,7 +342,7 @@ again:
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
 			goto inv_err;
 		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.imm_data = wqe->wr.ex.imm_data;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
 		if (!ipath_get_rwqe(qp, 1))
 			goto rnr_nak;
 		/* FALLTHROUGH */
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
index 0596ec16fcbd..82cc588b8bf2 100644
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -379,11 +379,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	case OP(SEND_LAST_WITH_IMMEDIATE):
 	send_last_imm:
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else {
 			/* Immediate data comes after BTH */
-			wc.imm_data = ohdr->u.imm_data;
+			wc.ex.imm_data = ohdr->u.imm_data;
 		}
 		hdrsize += 4;
 		wc.wc_flags = IB_WC_WITH_IMM;
@@ -483,11 +483,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
 	rdma_last_imm:
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else {
 			/* Immediate data comes after BTH */
-			wc.imm_data = ohdr->u.imm_data;
+			wc.ex.imm_data = ohdr->u.imm_data;
 		}
 		hdrsize += 4;
 		wc.wc_flags = IB_WC_WITH_IMM;
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
index 77ca8ca74e78..36aa242c487c 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -96,7 +96,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 
 	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
 		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.imm_data = swqe->wr.ex.imm_data;
+		wc.ex.imm_data = swqe->wr.ex.imm_data;
 	}
 
 	/*
@@ -492,14 +492,14 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	if (qp->ibqp.qp_num > 1 &&
 	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else
-			wc.imm_data = ohdr->u.ud.imm_data;
+			wc.ex.imm_data = ohdr->u.ud.imm_data;
 		wc.wc_flags = IB_WC_WITH_IMM;
 		hdrsize += sizeof(u32);
 	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-		wc.imm_data = 0;
+		wc.ex.imm_data = 0;
 		wc.wc_flags = 0;
 	} else {
 		dev->n_pkt_drops++;
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 7779165b2c2c..9e23ab0b51a1 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -1497,7 +1497,8 @@ static int ipath_query_device(struct ib_device *ibdev,
 		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
 		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
 	props->page_size_cap = PAGE_SIZE;
-	props->vendor_id = dev->dd->ipath_vendorid;
+	props->vendor_id =
+		IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
 	props->vendor_part_id = dev->dd->ipath_deviceid;
 	props->hw_ver = dev->dd->ipath_pcirev;
 
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 4521319b1406..299f20832ab6 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -663,18 +663,18 @@ repoll:
 
 		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
 		case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
-			wc->opcode   = IB_WC_RECV_RDMA_WITH_IMM;
-			wc->wc_flags = IB_WC_WITH_IMM;
-			wc->imm_data = cqe->immed_rss_invalid;
+			wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
+			wc->wc_flags	= IB_WC_WITH_IMM;
+			wc->ex.imm_data = cqe->immed_rss_invalid;
 			break;
 		case MLX4_RECV_OPCODE_SEND:
 			wc->opcode   = IB_WC_RECV;
 			wc->wc_flags = 0;
 			break;
 		case MLX4_RECV_OPCODE_SEND_IMM:
-			wc->opcode   = IB_WC_RECV;
-			wc->wc_flags = IB_WC_WITH_IMM;
-			wc->imm_data = cqe->immed_rss_invalid;
+			wc->opcode	= IB_WC_RECV;
+			wc->wc_flags	= IB_WC_WITH_IMM;
+			wc->ex.imm_data = cqe->immed_rss_invalid;
 			break;
 		}
 
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 4c1e72fc8f57..cdca3a511e1c 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -255,7 +255,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
 			return IB_MAD_RESULT_SUCCESS;
 	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
 		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1   ||
-		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2) {
+		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2   ||
+		   in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
 		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
 		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
 			return IB_MAD_RESULT_SUCCESS;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 4d61e32866c6..bcf50648fa18 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -90,7 +90,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
 		IB_DEVICE_PORT_ACTIVE_EVENT		|
 		IB_DEVICE_SYS_IMAGE_GUID		|
-		IB_DEVICE_RC_RNR_NAK_GEN;
+		IB_DEVICE_RC_RNR_NAK_GEN		|
+		IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
 		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
@@ -437,7 +438,9 @@ static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
 static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 {
 	return mlx4_multicast_attach(to_mdev(ibqp->device)->dev,
-				     &to_mqp(ibqp)->mqp, gid->raw);
+				     &to_mqp(ibqp)->mqp, gid->raw,
+				     !!(to_mqp(ibqp)->flags &
+					MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK));
 }
 
 static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 5cf994794d25..c4cf5b69eefa 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -101,7 +101,8 @@ struct mlx4_ib_wq {
 };
 
 enum mlx4_ib_qp_flags {
-	MLX4_IB_QP_LSO		= 1 << 0
+	MLX4_IB_QP_LSO				= 1 << 0,
+	MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK	= 1 << 1,
 };
 
 struct mlx4_ib_qp {
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index a80df22deae8..89eb6cbe592e 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -129,9 +129,10 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
 	int ind;
 	void *buf;
 	__be32 stamp;
+	struct mlx4_wqe_ctrl_seg *ctrl;
 
-	s = roundup(size, 1U << qp->sq.wqe_shift);
 	if (qp->sq_max_wqes_per_wr > 1) {
+		s = roundup(size, 1U << qp->sq.wqe_shift);
 		for (i = 0; i < s; i += 64) {
 			ind = (i >> qp->sq.wqe_shift) + n;
 			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
@@ -141,7 +142,8 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
 			*wqe = stamp;
 		}
 	} else {
-		buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+		s = (ctrl->fence_size & 0x3f) << 4;
 		for (i = 64; i < s; i += 64) {
 			wqe = buf + i;
 			*wqe = cpu_to_be32(0xffffffff);
@@ -452,19 +454,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	spin_lock_init(&qp->rq.lock);
 
 	qp->state	 = IB_QPS_RESET;
-	qp->atomic_rd_en = 0;
-	qp->resp_depth   = 0;
-
-	qp->rq.head	    = 0;
-	qp->rq.tail	    = 0;
-	qp->sq.head	    = 0;
-	qp->sq.tail	    = 0;
-	qp->sq_next_wqe     = 0;
-
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
-	else
-		qp->sq_signal_bits = 0;
 
 	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
 	if (err)
@@ -509,6 +500,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	} else {
 		qp->sq_no_prefetch = 0;
 
+		if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+			qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
 		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
 			qp->flags |= MLX4_IB_QP_LSO;
 
@@ -682,10 +676,15 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 	struct mlx4_ib_qp *qp;
 	int err;
 
-	/* We only support LSO, and only for kernel UD QPs. */
-	if (init_attr->create_flags & ~IB_QP_CREATE_IPOIB_UD_LSO)
+	/*
+	 * We only support LSO and multicast loopback blocking, and
+	 * only for kernel UD QPs.
+	 */
+	if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
 		return ERR_PTR(-EINVAL);
-	if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO &&
+
+	if (init_attr->create_flags &&
 	    (pd->uobject || init_attr->qp_type != IB_QPT_UD))
 		return ERR_PTR(-EINVAL);
 
@@ -694,7 +693,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 	case IB_QPT_UC:
 	case IB_QPT_UD:
 	{
-		qp = kmalloc(sizeof *qp, GFP_KERNEL);
+		qp = kzalloc(sizeof *qp, GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
 
@@ -715,7 +714,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 		if (pd->uobject)
 			return ERR_PTR(-EINVAL);
 
-		sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
+		sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
 		if (!sqp)
 			return ERR_PTR(-ENOMEM);
 
@@ -906,7 +905,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			       attr->path_mtu);
 			goto out;
 		}
-		context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+		context->mtu_msgmax = (attr->path_mtu << 5) |
+			ilog2(dev->dev->caps.max_msg_sz);
 	}
 
 	if (qp->rq.wqe_cnt)
@@ -1063,6 +1063,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
 			ctrl = get_send_wqe(qp, i);
 			ctrl->owner_opcode = cpu_to_be32(1 << 31);
+			if (qp->sq_max_wqes_per_wr == 1)
+				ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
 
 			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
 		}
@@ -1127,23 +1129,6 @@ out:
 	return err;
 }
 
-static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
-static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
-		[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
-				IB_QP_PORT			|
-				IB_QP_QKEY),
-		[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
-				IB_QP_PORT			|
-				IB_QP_ACCESS_FLAGS),
-		[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
-				IB_QP_PORT			|
-				IB_QP_ACCESS_FLAGS),
-		[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
-				IB_QP_QKEY),
-		[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
-				IB_QP_QKEY),
-};
-
 int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		      int attr_mask, struct ib_udata *udata)
 {
@@ -1186,15 +1171,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		goto out;
 	}
 
-	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
-		err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
-					  mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
-					  IB_QPS_RESET, IB_QPS_INIT);
-		if (err)
-			goto out;
-		cur_state = IB_QPS_INIT;
-	}
-
 	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
 
 out:
@@ -1865,6 +1841,13 @@ done:
 
 	qp_init_attr->cap	     = qp_attr->cap;
 
+	qp_init_attr->create_flags = 0;
+	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	if (qp->flags & MLX4_IB_QP_LSO)
+		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
 out:
 	mutex_unlock(&qp->mutex);
 	return err;
diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
index a76306709618..c5ccc2daab60 100644
--- a/drivers/infiniband/hw/mthca/mthca_allocator.c
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_allocator.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/errno.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 4b111a852ff6..32f6c6315454 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_av.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/string.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
index e948158a28d9..cc440f90000b 100644
--- a/drivers/infiniband/hw/mthca/mthca_catas.c
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id$
  */
 
 #include <linux/jiffies.h>
@@ -128,7 +126,6 @@ static void handle_catas(struct mthca_dev *dev)
 static void poll_catas(unsigned long dev_ptr)
 {
 	struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
-	unsigned long flags;
 	int i;
 
 	for (i = 0; i < dev->catas_err.size; ++i)
@@ -137,13 +134,8 @@ static void poll_catas(unsigned long dev_ptr)
 			return;
 		}
 
-	spin_lock_irqsave(&catas_lock, flags);
-	if (!dev->catas_err.stop)
-		mod_timer(&dev->catas_err.timer,
-			  jiffies + MTHCA_CATAS_POLL_INTERVAL);
-	spin_unlock_irqrestore(&catas_lock, flags);
-
-	return;
+	mod_timer(&dev->catas_err.timer,
+		  round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
 }
 
 void mthca_start_catas_poll(struct mthca_dev *dev)
@@ -151,7 +143,6 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
 	unsigned long addr;
 
 	init_timer(&dev->catas_err.timer);
-	dev->catas_err.stop = 0;
 	dev->catas_err.map  = NULL;
 
 	addr = pci_resource_start(dev->pdev, 0) +
@@ -182,10 +173,6 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
 
 void mthca_stop_catas_poll(struct mthca_dev *dev)
 {
-	spin_lock_irq(&catas_lock);
-	dev->catas_err.stop = 1;
-	spin_unlock_irq(&catas_lock);
-
 	del_timer_sync(&dev->catas_err.timer);
 
 	if (dev->catas_err.map) {
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 54d230ee7d63..c33e1c53c799 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_cmd.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/completion.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h
index 8928ca4a9325..6efd3265f248 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.h
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_cmd.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef MTHCA_CMD_H
diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h
index afa56bfaab2e..75671f75cac4 100644
--- a/drivers/infiniband/hw/mthca/mthca_config_reg.h
+++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_config_reg.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef MTHCA_CONFIG_REG_H
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index 20401d2ba6b2..d9f4735c2b37 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_cq.c 1369 2004-12-20 16:17:07Z roland $
  */
 
 #include <linux/hardirq.h>
@@ -622,13 +620,13 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
 		case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
 		case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
 			entry->wc_flags = IB_WC_WITH_IMM;
-			entry->imm_data = cqe->imm_etype_pkey_eec;
+			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
 			entry->opcode = IB_WC_RECV;
 			break;
 		case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
 		case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
 			entry->wc_flags = IB_WC_WITH_IMM;
-			entry->imm_data = cqe->imm_etype_pkey_eec;
+			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
 			entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
 			break;
 		default:
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 7bc32f8e377e..ee4d073c889f 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_dev.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef MTHCA_DEV_H
@@ -279,7 +277,6 @@ struct mthca_mcg_table {
 struct mthca_catas_err {
 	u64			addr;
 	u32 __iomem	       *map;
-	unsigned long		stop;
 	u32			size;
 	struct timer_list	timer;
 	struct list_head	list;
diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h
index b374dc395be1..14f51ef97d7e 100644
--- a/drivers/infiniband/hw/mthca/mthca_doorbell.h
+++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_doorbell.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/types.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
index 8bde7f98e58a..4e36aa7cb3d2 100644
--- a/drivers/infiniband/hw/mthca/mthca_eq.c
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_eq.c 1382 2004-12-24 02:21:02Z roland $
  */
 
 #include <linux/errno.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 8b7e83e6e88f..640449582aba 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_mad.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/string.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 200cf13fc9bb..fb9f91b60f30 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_main.c 1396 2004-12-28 04:10:27Z roland $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
index a8ad072be074..3f5f94879208 100644
--- a/drivers/infiniband/hw/mthca/mthca_mcg.c
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_mcg.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/string.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index d5862e5d99a0..1f7d1a29d2a8 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id$
  */
 
 #include <linux/mm.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
index a1ab06847b75..da9b8f9b884f 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.h
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id$
  */
 
 #ifndef MTHCA_MEMFREE_H
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index 820205dec560..8489b1e81c0f 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_mr.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/slab.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c
index c1e950764bd8..266f14e47406 100644
--- a/drivers/infiniband/hw/mthca/mthca_pd.c
+++ b/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_pd.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/errno.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
index 605a8d57fac6..d168c2540611 100644
--- a/drivers/infiniband/hw/mthca/mthca_profile.c
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_profile.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h
index e76cb62d8e32..62b009cc8730 100644
--- a/drivers/infiniband/hw/mthca/mthca_profile.h
+++ b/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_profile.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef MTHCA_PROFILE_H
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index be34f99ca625..87ad889e367b 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_provider.c 4859 2006-01-09 21:55:10Z roland $
  */
 
 #include <rdma/ib_smi.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
index 934bf9544037..c621f8794b88 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.h
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_provider.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef MTHCA_PROVIDER_H
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 09dc3614cf2c..f5081bfde6db 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_qp.c 1355 2004-12-17 15:23:43Z roland $
  */
 
 #include <linux/string.h>
@@ -850,23 +848,6 @@ out:
 	return err;
 }
 
-static const struct ib_qp_attr dummy_init_attr = { .port_num = 1 };
-static const int dummy_init_attr_mask[] = {
-	[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
-			IB_QP_PORT			|
-			IB_QP_QKEY),
-	[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
-			IB_QP_PORT			|
-			IB_QP_ACCESS_FLAGS),
-	[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
-			IB_QP_PORT			|
-			IB_QP_ACCESS_FLAGS),
-	[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
-			IB_QP_QKEY),
-	[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
-			IB_QP_QKEY),
-};
-
 int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 		    struct ib_udata *udata)
 {
@@ -928,15 +909,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 		goto out;
 	}
 
-	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
-		err = __mthca_modify_qp(ibqp, &dummy_init_attr,
-					dummy_init_attr_mask[ibqp->qp_type],
-					IB_QPS_RESET, IB_QPS_INIT);
-		if (err)
-			goto out;
-		cur_state = IB_QPS_INIT;
-	}
-
 	err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
 
 out:
@@ -1277,10 +1249,10 @@ static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap,
 		return -EINVAL;
 
 	/*
-	 * For MLX transport we need 2 extra S/G entries:
+	 * For MLX transport we need 2 extra send gather entries:
 	 * one for the header and one for the checksum at the end
 	 */
-	if (qp->transport == MLX && cap->max_recv_sge + 2 > dev->limits.max_sg)
+	if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg)
 		return -EINVAL;
 
 	if (mthca_is_memfree(dev)) {
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
index 91934f2d9dba..acb6817f6060 100644
--- a/drivers/infiniband/hw/mthca/mthca_reset.c
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_reset.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/init.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index a5ffff6e1026..4fabe62aab8a 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_srq.c 3047 2005-08-10 03:59:35Z roland $
  */
 
 #include <linux/slab.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c
index 8b728486410d..ca5900c96fcf 100644
--- a/drivers/infiniband/hw/mthca/mthca_uar.c
+++ b/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id$
  */
 
 #include <asm/page.h>		/* PAGE_SHIFT */
diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h
index e1262c942db8..5fe56e810739 100644
--- a/drivers/infiniband/hw/mthca/mthca_user.h
+++ b/drivers/infiniband/hw/mthca/mthca_user.h
@@ -29,7 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
  */
 
 #ifndef MTHCA_USER_H
diff --git a/drivers/infiniband/hw/mthca/mthca_wqe.h b/drivers/infiniband/hw/mthca/mthca_wqe.h
index b3551a8dea1d..341a5ae881c1 100644
--- a/drivers/infiniband/hw/mthca/mthca_wqe.h
+++ b/drivers/infiniband/hw/mthca/mthca_wqe.h
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mthca_wqe.h 3047 2005-08-10 03:59:35Z roland $
  */
 
 #ifndef MTHCA_WQE_H
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index a4e9269a29bd..d2884e778098 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -328,7 +328,7 @@ void nes_rem_ref(struct ib_qp *ibqp)
 		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
 		u64temp = (u64)nesqp->nesqp_context_pbase;
 		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-		nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+		nes_post_cqp_request(nesdev, cqp_request);
 	}
 }
 
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
index 61b46e9c7d2d..39bd897b40c6 100644
--- a/drivers/infiniband/hw/nes/nes.h
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -94,9 +94,6 @@
 
 #define MAX_DPC_ITERATIONS               128
 
-#define NES_CQP_REQUEST_NO_DOORBELL_RING 0
-#define NES_CQP_REQUEST_RING_DOORBELL    1
-
 #define NES_DRV_OPT_ENABLE_MPA_VER_0     0x00000001
 #define NES_DRV_OPT_DISABLE_MPA_CRC      0x00000002
 #define NES_DRV_OPT_DISABLE_FIRST_WRITE  0x00000004
@@ -538,7 +535,11 @@ void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *);
 void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16);
 void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16);
 struct nes_cqp_request *nes_get_cqp_request(struct nes_device *);
-void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *, int);
+void nes_free_cqp_request(struct nes_device *nesdev,
+			  struct nes_cqp_request *cqp_request);
+void nes_put_cqp_request(struct nes_device *nesdev,
+			 struct nes_cqp_request *cqp_request);
+void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *);
 int nes_arp_table(struct nes_device *, u32, u8 *, u32);
 void nes_mh_fix(unsigned long);
 void nes_clc(unsigned long);
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 9a4b40fae40d..6aa531d5276d 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -1603,7 +1603,6 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
 			return NULL;
 		}
 
-		memset(listener, 0, sizeof(struct nes_cm_listener));
 		listener->loc_addr = htonl(cm_info->loc_addr);
 		listener->loc_port = htons(cm_info->loc_port);
 		listener->reused_node = 0;
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
index d3278f111ca7..85f26d19a32b 100644
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -398,7 +398,7 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
 	nesadapter->base_pd = 1;
 
 	nesadapter->device_cap_flags =
-		IB_DEVICE_ZERO_STAG | IB_DEVICE_MEM_WINDOW;
+		IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
 
 	nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter)
 			[(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]);
@@ -2710,39 +2710,11 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq)
 					barrier();
 					cqp_request->request_done = 1;
 					wake_up(&cqp_request->waitq);
-					if (atomic_dec_and_test(&cqp_request->refcount)) {
-						nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
-								cqp_request,
-								le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f);
-						if (cqp_request->dynamic) {
-							kfree(cqp_request);
-						} else {
-							spin_lock_irqsave(&nesdev->cqp.lock, flags);
-							list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-							spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-						}
-					}
-				} else if (cqp_request->callback) {
-					/* Envoke the callback routine */
-					cqp_request->cqp_callback(nesdev, cqp_request);
-					if (cqp_request->dynamic) {
-						kfree(cqp_request);
-					} else {
-						spin_lock_irqsave(&nesdev->cqp.lock, flags);
-						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-					}
+					nes_put_cqp_request(nesdev, cqp_request);
 				} else {
-					nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
-							cqp_request,
-							le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
-					if (cqp_request->dynamic) {
-						kfree(cqp_request);
-					} else {
-						spin_lock_irqsave(&nesdev->cqp.lock, flags);
-						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-					}
+					if (cqp_request->callback)
+						cqp_request->cqp_callback(nesdev, cqp_request);
+					nes_free_cqp_request(nesdev, cqp_request);
 				}
 			} else {
 				wake_up(&nesdev->cqp.waitq);
@@ -3149,7 +3121,6 @@ int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
 {
 	struct nes_device *nesdev = nesvnic->nesdev;
 	struct nes_hw_cqp_wqe *cqp_wqe;
-	unsigned long flags;
 	struct nes_cqp_request *cqp_request;
 	int ret = 0;
 	u16 major_code;
@@ -3176,7 +3147,7 @@ int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
 	nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n");
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	if (add_port == NES_MANAGE_APBVT_ADD)
 		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
@@ -3184,15 +3155,9 @@ int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
 	nes_debug(NES_DBG_QP, "Completed, ret=%u,  CQP Major:Minor codes = 0x%04X:0x%04X\n",
 			ret, cqp_request->major_code, cqp_request->minor_code);
 	major_code = cqp_request->major_code;
-	if (atomic_dec_and_test(&cqp_request->refcount)) {
-		if (cqp_request->dynamic) {
-			kfree(cqp_request);
-		} else {
-			spin_lock_irqsave(&nesdev->cqp.lock, flags);
-			list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-		}
-	}
+
+	nes_put_cqp_request(nesdev, cqp_request);
+
 	if (!ret)
 		return -ETIME;
 	else if (major_code)
@@ -3252,7 +3217,7 @@ void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
 			nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
 
 	atomic_set(&cqp_request->refcount, 1);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 }
 
 
@@ -3262,7 +3227,6 @@ void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
 void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
 		u32 which_wq, u32 wait_completion)
 {
-	unsigned long flags;
 	struct nes_cqp_request *cqp_request;
 	struct nes_hw_cqp_wqe *cqp_wqe;
 	int ret;
@@ -3285,7 +3249,7 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
 			cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq);
 	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id);
 
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	if (wait_completion) {
 		/* Wait for CQP */
@@ -3294,14 +3258,6 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
 		nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u,"
 				" CQP Major:Minor codes = 0x%04X:0x%04X\n",
 				ret, cqp_request->major_code, cqp_request->minor_code);
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
+		nes_put_cqp_request(nesdev, cqp_request);
 	}
 }
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
index 745bf94f3f07..7b81e0ae0076 100644
--- a/drivers/infiniband/hw/nes/nes_hw.h
+++ b/drivers/infiniband/hw/nes/nes_hw.h
@@ -1172,7 +1172,7 @@ struct nes_vnic {
 	u32    mcrq_qp_id;
 	struct nes_ucontext *mcrq_ucontext;
 	struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev);
-	void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *, int);
+	void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *);
 	int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr );
 	struct net_device_stats netstats;
 	/* used to put the netdev on the adapters logical port list */
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
index fe83d1b2b177..fb8cbd71a2ef 100644
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -567,12 +567,36 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
 	return cqp_request;
 }
 
+void nes_free_cqp_request(struct nes_device *nesdev,
+			  struct nes_cqp_request *cqp_request)
+{
+	unsigned long flags;
+
+	nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
+		  cqp_request,
+		  le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
+
+	if (cqp_request->dynamic) {
+		kfree(cqp_request);
+	} else {
+		spin_lock_irqsave(&nesdev->cqp.lock, flags);
+		list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	}
+}
+
+void nes_put_cqp_request(struct nes_device *nesdev,
+			 struct nes_cqp_request *cqp_request)
+{
+	if (atomic_dec_and_test(&cqp_request->refcount))
+		nes_free_cqp_request(nesdev, cqp_request);
+}
 
 /**
  * nes_post_cqp_request
  */
 void nes_post_cqp_request(struct nes_device *nesdev,
-		struct nes_cqp_request *cqp_request, int ring_doorbell)
+			  struct nes_cqp_request *cqp_request)
 {
 	struct nes_hw_cqp_wqe *cqp_wqe;
 	unsigned long flags;
@@ -600,10 +624,9 @@ void nes_post_cqp_request(struct nes_device *nesdev,
 				nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size,
 				cqp_request->waiting, atomic_read(&cqp_request->refcount));
 		barrier();
-		if (ring_doorbell) {
-			/* Ring doorbell (1 WQEs) */
-			nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
-		}
+
+		/* Ring doorbell (1 WQEs) */
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
 
 		barrier();
 	} else {
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index d617da9bd351..e3939d13484e 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -55,7 +55,6 @@ static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev);
  * nes_alloc_mw
  */
 static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
-	unsigned long flags;
 	struct nes_pd *nespd = to_nespd(ibpd);
 	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
 	struct nes_device *nesdev = nesvnic->nesdev;
@@ -119,7 +118,7 @@ static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
 	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
@@ -128,15 +127,7 @@ static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
 			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
 			stag, ret, cqp_request->major_code, cqp_request->minor_code);
 	if ((!ret) || (cqp_request->major_code)) {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
+		nes_put_cqp_request(nesdev, cqp_request);
 		kfree(nesmr);
 		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
 		if (!ret) {
@@ -144,17 +135,8 @@ static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
 		} else {
 			return ERR_PTR(-ENOMEM);
 		}
-	} else {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
 	}
+	nes_put_cqp_request(nesdev, cqp_request);
 
 	nesmr->ibmw.rkey = stag;
 	nesmr->mode = IWNES_MEMREG_TYPE_MW;
@@ -178,7 +160,6 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
 	struct nes_hw_cqp_wqe *cqp_wqe;
 	struct nes_cqp_request *cqp_request;
 	int err = 0;
-	unsigned long flags;
 	int ret;
 
 	/* Deallocate the window with the adapter */
@@ -194,7 +175,7 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
 	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey);
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n",
@@ -204,32 +185,12 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
 	nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u,"
 			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
 			ret, cqp_request->major_code, cqp_request->minor_code);
-	if ((!ret) || (cqp_request->major_code)) {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
-		if (!ret) {
-			err = -ETIME;
-		} else {
-			err = -EIO;
-		}
-	} else {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
-	}
+	if (!ret)
+		err = -ETIME;
+	else if (cqp_request->major_code)
+		err = -EIO;
+
+	nes_put_cqp_request(nesdev, cqp_request);
 
 	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
 			(ibmw->rkey & 0x0fffff00) >> 8);
@@ -516,7 +477,7 @@ static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd,
 			(nesfmr->nesmr.pbls_used-1) : nesfmr->nesmr.pbls_used);
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
@@ -526,29 +487,11 @@ static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd,
 			stag, ret, cqp_request->major_code, cqp_request->minor_code);
 
 	if ((!ret) || (cqp_request->major_code)) {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
+		nes_put_cqp_request(nesdev, cqp_request);
 		ret = (!ret) ? -ETIME : -EIO;
 		goto failed_leaf_vpbl_pages_alloc;
-	} else {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
 	}
-
+	nes_put_cqp_request(nesdev, cqp_request);
 	nesfmr->nesmr.ibfmr.lkey = stag;
 	nesfmr->nesmr.ibfmr.rkey = stag;
 	nesfmr->attr = *ibfmr_attr;
@@ -1474,7 +1417,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 			set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
 
 			atomic_set(&cqp_request->refcount, 2);
-			nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+			nes_post_cqp_request(nesdev, cqp_request);
 
 			/* Wait for CQP */
 			nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n",
@@ -1487,15 +1430,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 					nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail,
 					cqp_request->major_code, cqp_request->minor_code);
 			if ((!ret) || (cqp_request->major_code)) {
-				if (atomic_dec_and_test(&cqp_request->refcount)) {
-					if (cqp_request->dynamic) {
-						kfree(cqp_request);
-					} else {
-						spin_lock_irqsave(&nesdev->cqp.lock, flags);
-						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-					}
-				}
+				nes_put_cqp_request(nesdev, cqp_request);
 				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
 				nes_free_qp_mem(nesdev, nesqp,virt_wqs);
 				kfree(nesqp->allocated_buffer);
@@ -1504,18 +1439,10 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 				} else {
 					return ERR_PTR(-EIO);
 				}
-			} else {
-				if (atomic_dec_and_test(&cqp_request->refcount)) {
-					if (cqp_request->dynamic) {
-						kfree(cqp_request);
-					} else {
-						spin_lock_irqsave(&nesdev->cqp.lock, flags);
-						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-					}
-				}
 			}
 
+			nes_put_cqp_request(nesdev, cqp_request);
+
 			if (ibpd->uobject) {
 				uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
 				uresp.actual_sq_size = sq_size;
@@ -1817,7 +1744,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries,
 			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n",
@@ -1827,32 +1754,15 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries,
 	nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n",
 			nescq->hw_cq.cq_number, ret);
 	if ((!ret) || (cqp_request->major_code)) {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
+		nes_put_cqp_request(nesdev, cqp_request);
 		if (!context)
 			pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
 					nescq->hw_cq.cq_pbase);
 		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
 		kfree(nescq);
 		return ERR_PTR(-EIO);
-	} else {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
 	}
+	nes_put_cqp_request(nesdev, cqp_request);
 
 	if (context) {
 		/* free the nespbl */
@@ -1931,7 +1841,7 @@ static int nes_destroy_cq(struct ib_cq *ib_cq)
 		(nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16)));
 	nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number);
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
@@ -1942,37 +1852,18 @@ static int nes_destroy_cq(struct ib_cq *ib_cq)
 			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
 			nescq->hw_cq.cq_number, ret, cqp_request->major_code,
 			cqp_request->minor_code);
-	if ((!ret) || (cqp_request->major_code)) {
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
-		if (!ret) {
-			nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
+	if (!ret) {
+		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
 					nescq->hw_cq.cq_number);
-			ret = -ETIME;
-		} else {
-			nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
+		ret = -ETIME;
+	} else if (cqp_request->major_code) {
+		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
 					nescq->hw_cq.cq_number);
-			ret = -EIO;
-		}
+		ret = -EIO;
 	} else {
 		ret = 0;
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
 	}
+	nes_put_cqp_request(nesdev, cqp_request);
 
 	if (nescq->cq_mem_size)
 		pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
@@ -2096,7 +1987,7 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 	barrier();
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
@@ -2105,15 +1996,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
 			stag, ret, cqp_request->major_code, cqp_request->minor_code);
 	major_code = cqp_request->major_code;
-	if (atomic_dec_and_test(&cqp_request->refcount)) {
-		if (cqp_request->dynamic) {
-			kfree(cqp_request);
-		} else {
-			spin_lock_irqsave(&nesdev->cqp.lock, flags);
-			list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-		}
-	}
+	nes_put_cqp_request(nesdev, cqp_request);
+
 	if (!ret)
 		return -ETIME;
 	else if (major_code)
@@ -2754,7 +2638,7 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
 	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey);
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey);
@@ -2771,15 +2655,9 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
 
 	major_code = cqp_request->major_code;
 	minor_code = cqp_request->minor_code;
-	if (atomic_dec_and_test(&cqp_request->refcount)) {
-		if (cqp_request->dynamic) {
-			kfree(cqp_request);
-		} else {
-			spin_lock_irqsave(&nesdev->cqp.lock, flags);
-			list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-		}
-	}
+
+	nes_put_cqp_request(nesdev, cqp_request);
+
 	if (!ret) {
 		nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag,"
 				" ib_mr=%p, rkey = 0x%08X\n",
@@ -2904,7 +2782,6 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
 	/* struct iw_cm_id *cm_id = nesqp->cm_id; */
 	/* struct iw_cm_event cm_event; */
 	struct nes_cqp_request *cqp_request;
-	unsigned long flags;
 	int ret;
 	u16 major_code;
 
@@ -2932,7 +2809,7 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
 	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase);
 
 	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	nes_post_cqp_request(nesdev, cqp_request);
 
 	/* Wait for CQP */
 	if (wait_completion) {
@@ -2950,15 +2827,9 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
 					nesqp->hwqp.qp_id, cqp_request->major_code,
 					cqp_request->minor_code, next_iwarp_state);
 		}
-		if (atomic_dec_and_test(&cqp_request->refcount)) {
-			if (cqp_request->dynamic) {
-				kfree(cqp_request);
-			} else {
-				spin_lock_irqsave(&nesdev->cqp.lock, flags);
-				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-			}
-		}
+
+		nes_put_cqp_request(nesdev, cqp_request);
+
 		if (!ret)
 			return -ETIME;
 		else if (major_code)
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index 1f76bad020f3..691525cf394a 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -1,6 +1,7 @@
 config INFINIBAND_IPOIB
 	tristate "IP-over-InfiniBand"
 	depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+	select INET_LRO
 	---help---
 	  Support for the IP-over-InfiniBand protocol (IPoIB). This
 	  transports IP packets over InfiniBand so you can use your IB
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca126fc2b853..b0ffc9abe8c0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib.h 1358 2004-12-17 22:00:11Z roland $
  */
 
 #ifndef _IPOIB_H
@@ -52,9 +50,16 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_sa.h>
+#include <linux/inet_lro.h>
 
 /* constants */
 
+enum ipoib_flush_level {
+	IPOIB_FLUSH_LIGHT,
+	IPOIB_FLUSH_NORMAL,
+	IPOIB_FLUSH_HEAVY
+};
+
 enum {
 	IPOIB_ENCAP_LEN		  = 4,
 
@@ -65,8 +70,8 @@ enum {
 	IPOIB_CM_BUF_SIZE	  = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
 	IPOIB_CM_HEAD_SIZE	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
 	IPOIB_CM_RX_SG		  = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
-	IPOIB_RX_RING_SIZE	  = 128,
-	IPOIB_TX_RING_SIZE	  = 64,
+	IPOIB_RX_RING_SIZE	  = 256,
+	IPOIB_TX_RING_SIZE	  = 128,
 	IPOIB_MAX_QUEUE_SIZE	  = 8192,
 	IPOIB_MIN_QUEUE_SIZE	  = 2,
 	IPOIB_CM_MAX_CONN_QP	  = 4096,
@@ -84,7 +89,6 @@ enum {
 	IPOIB_FLAG_SUBINTERFACE	  = 5,
 	IPOIB_MCAST_RUN		  = 6,
 	IPOIB_STOP_REAPER	  = 7,
-	IPOIB_MCAST_STARTED	  = 8,
 	IPOIB_FLAG_ADMIN_CM	  = 9,
 	IPOIB_FLAG_UMCAST	  = 10,
 	IPOIB_FLAG_CSUM		  = 11,
@@ -96,7 +100,11 @@ enum {
 	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
 
+	IPOIB_MAX_LRO_DESCRIPTORS = 8,
+	IPOIB_LRO_MAX_AGGR 	  = 64,
+
 	MAX_SEND_CQE		  = 16,
+	IPOIB_CM_COPYBREAK	  = 256,
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -149,6 +157,11 @@ struct ipoib_tx_buf {
 	u64		mapping[MAX_SKB_FRAGS + 1];
 };
 
+struct ipoib_cm_tx_buf {
+	struct sk_buff *skb;
+	u64		mapping;
+};
+
 struct ib_cm_id;
 
 struct ipoib_cm_data {
@@ -207,7 +220,7 @@ struct ipoib_cm_tx {
 	struct net_device   *dev;
 	struct ipoib_neigh  *neigh;
 	struct ipoib_path   *path;
-	struct ipoib_tx_buf *tx_ring;
+	struct ipoib_cm_tx_buf *tx_ring;
 	unsigned	     tx_head;
 	unsigned	     tx_tail;
 	unsigned long	     flags;
@@ -249,6 +262,11 @@ struct ipoib_ethtool_st {
 	u16     max_coalesced_frames;
 };
 
+struct ipoib_lro {
+	struct net_lro_mgr lro_mgr;
+	struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS];
+};
+
 /*
  * Device private locking: tx_lock protects members used in TX fast
  * path (and we use LLTX so upper layers don't do extra locking).
@@ -264,7 +282,6 @@ struct ipoib_dev_priv {
 
 	unsigned long flags;
 
-	struct mutex mcast_mutex;
 	struct mutex vlan_mutex;
 
 	struct rb_root  path_tree;
@@ -276,10 +293,11 @@ struct ipoib_dev_priv {
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
-	struct work_struct flush_task;
+	struct work_struct flush_light;
+	struct work_struct flush_normal;
+	struct work_struct flush_heavy;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
-	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
 	u8		  port;
@@ -335,6 +353,8 @@ struct ipoib_dev_priv {
 	int	hca_caps;
 	struct ipoib_ethtool_st ethtool;
 	struct timer_list poll_timer;
+
+	struct ipoib_lro lro;
 };
 
 struct ipoib_ah {
@@ -359,6 +379,7 @@ struct ipoib_path {
 
 	struct rb_node	      rb_node;
 	struct list_head      list;
+	int  		      valid;
 };
 
 struct ipoib_neigh {
@@ -423,11 +444,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(struct work_struct *work);
 
+void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
@@ -466,9 +490,7 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 #endif
 
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
-		       union ib_gid *mgid);
-int ipoib_mcast_detach(struct net_device *dev, u16 mlid,
-		       union ib_gid *mgid);
+		       union ib_gid *mgid, int set_qkey);
 
 int ipoib_init_qp(struct net_device *dev);
 int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 97e67d36378f..0f2d3045061a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id$
  */
 
 #include <rdma/ib_cm.h>
@@ -113,18 +111,20 @@ static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
 }
 
 static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
-					struct ipoib_cm_rx *rx, int id)
+					struct ipoib_cm_rx *rx,
+					struct ib_recv_wr *wr,
+					struct ib_sge *sge, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_recv_wr *bad_wr;
 	int i, ret;
 
-	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
 	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
-		priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i];
+		sge[i].addr = rx->rx_ring[id].mapping[i];
 
-	ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr);
+	ret = ib_post_recv(rx->qp, wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
 		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
@@ -322,10 +322,33 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
 	return 0;
 }
 
+static void ipoib_cm_init_rx_wr(struct net_device *dev,
+				struct ib_recv_wr *wr,
+				struct ib_sge *sge)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < priv->cm.num_frags; ++i)
+		sge[i].lkey = priv->mr->lkey;
+
+	sge[0].length = IPOIB_CM_HEAD_SIZE;
+	for (i = 1; i < priv->cm.num_frags; ++i)
+		sge[i].length = PAGE_SIZE;
+
+	wr->next    = NULL;
+	wr->sg_list = priv->cm.rx_sge;
+	wr->num_sge = priv->cm.num_frags;
+}
+
 static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
 				   struct ipoib_cm_rx *rx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct {
+		struct ib_recv_wr wr;
+		struct ib_sge sge[IPOIB_CM_RX_SG];
+	} *t;
 	int ret;
 	int i;
 
@@ -333,6 +356,14 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
 	if (!rx->rx_ring)
 		return -ENOMEM;
 
+	t = kmalloc(sizeof *t, GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
+
 	spin_lock_irq(&priv->lock);
 
 	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
@@ -351,8 +382,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 				ret = -ENOMEM;
 				goto err_count;
-			}
-		ret = ipoib_cm_post_receive_nonsrq(dev, rx, i);
+		}
+		ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
 		if (ret) {
 			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
 				   "failed for buf %d\n", i);
@@ -363,6 +394,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
 
 	rx->recv_count = ipoib_recvq_size;
 
+	kfree(t);
+
 	return 0;
 
 err_count:
@@ -371,6 +404,7 @@ err_count:
 	spin_unlock_irq(&priv->lock);
 
 err_free:
+	kfree(t);
 	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
 
 	return ret;
@@ -525,6 +559,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	u64 mapping[IPOIB_CM_RX_SG];
 	int frags;
 	int has_srq;
+	struct sk_buff *small_skb;
 
 	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -579,6 +614,23 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		}
 	}
 
+	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
+		int dlen = wc->byte_len;
+
+		small_skb = dev_alloc_skb(dlen + 12);
+		if (small_skb) {
+			skb_reserve(small_skb, 12);
+			ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
+						   dlen, DMA_FROM_DEVICE);
+			skb_copy_from_linear_data(skb, small_skb->data, dlen);
+			ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
+						      dlen, DMA_FROM_DEVICE);
+			skb_put(small_skb, dlen);
+			skb = small_skb;
+			goto copied;
+		}
+	}
+
 	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
 					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
 
@@ -601,6 +653,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
 
+copied:
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
 	skb_reset_mac_header(skb);
 	skb_pull(skb, IPOIB_ENCAP_LEN);
@@ -620,7 +673,10 @@ repost:
 			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
 				   "for buf %d\n", wr_id);
 	} else {
-		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) {
+		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
+							  &priv->cm.rx_wr,
+							  priv->cm.rx_sge,
+							  wr_id))) {
 			--p->recv_count;
 			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
 				   "for buf %d\n", wr_id);
@@ -647,7 +703,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	u64 addr;
 
 	if (unlikely(skb->len > tx->mtu)) {
@@ -678,7 +734,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 		return;
 	}
 
-	tx_req->mapping[0] = addr;
+	tx_req->mapping = addr;
 
 	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
 			       addr, skb->len))) {
@@ -703,7 +759,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_tx *tx = wc->qp->qp_context;
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	unsigned long flags;
 
 	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -717,7 +773,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE);
+	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++dev->stats.tx_packets;
@@ -1087,7 +1143,7 @@ err_tx:
 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	unsigned long flags;
 	unsigned long begin;
 
@@ -1115,7 +1171,7 @@ timeout:
 
 	while ((int) p->tx_tail - (int) p->tx_head < 0) {
 		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len,
+		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
 				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
 		++p->tx_tail;
@@ -1384,7 +1440,9 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
 		ipoib_warn(priv, "enabling connected mode "
 			   "will cause multicast packet drops\n");
 
+		rtnl_lock();
 		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO);
+		rtnl_unlock();
 		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 
 		ipoib_flush_paths(dev);
@@ -1393,14 +1451,16 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
 
 	if (!strcmp(buf, "datagram\n")) {
 		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
-		dev->mtu = min(priv->mcast_mtu, dev->mtu);
-		ipoib_flush_paths(dev);
 
+		rtnl_lock();
 		if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) {
 			dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
 			if (priv->hca_caps & IB_DEVICE_UD_TSO)
 				dev->features |= NETIF_F_TSO;
 		}
+		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
+		rtnl_unlock();
+		ipoib_flush_paths(dev);
 
 		return count;
 	}
@@ -1485,15 +1545,7 @@ int ipoib_cm_dev_init(struct net_device *dev)
 		priv->cm.num_frags  = IPOIB_CM_RX_SG;
 	}
 
-	for (i = 0; i < priv->cm.num_frags; ++i)
-		priv->cm.rx_sge[i].lkey	= priv->mr->lkey;
-
-	priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
-	for (i = 1; i < priv->cm.num_frags; ++i)
-		priv->cm.rx_sge[i].length = PAGE_SIZE;
-	priv->cm.rx_wr.next = NULL;
-	priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
-	priv->cm.rx_wr.num_sge = priv->cm.num_frags;
+	ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
 
 	if (ipoib_cm_has_srq(dev)) {
 		for (i = 0; i < ipoib_recvq_size; ++i) {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 10279b79c44d..66af5c1a76e5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -86,11 +86,57 @@ static int ipoib_set_coalesce(struct net_device *dev,
 	return 0;
 }
 
+static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = {
+	"LRO aggregated", "LRO flushed",
+	"LRO avg aggr", "LRO no desc"
+};
+
+static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+	switch (stringset) {
+	case ETH_SS_STATS:
+		memcpy(data, *ipoib_stats_keys,	sizeof(ipoib_stats_keys));
+		break;
+	}
+}
+
+static int ipoib_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(ipoib_stats_keys);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void ipoib_get_ethtool_stats(struct net_device *dev,
+				struct ethtool_stats *stats, uint64_t *data)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int index = 0;
+
+	/* Get LRO statistics */
+	data[index++] = priv->lro.lro_mgr.stats.aggregated;
+	data[index++] = priv->lro.lro_mgr.stats.flushed;
+	if (priv->lro.lro_mgr.stats.flushed)
+		data[index++] = priv->lro.lro_mgr.stats.aggregated /
+				priv->lro.lro_mgr.stats.flushed;
+	else
+		data[index++] = 0;
+	data[index++] = priv->lro.lro_mgr.stats.no_desc;
+}
+
 static const struct ethtool_ops ipoib_ethtool_ops = {
 	.get_drvinfo		= ipoib_get_drvinfo,
 	.get_tso		= ethtool_op_get_tso,
 	.get_coalesce		= ipoib_get_coalesce,
 	.set_coalesce		= ipoib_set_coalesce,
+	.get_flags		= ethtool_op_get_flags,
+	.set_flags		= ethtool_op_set_flags,
+	.get_strings		= ipoib_get_strings,
+	.get_sset_count		= ipoib_get_sset_count,
+	.get_ethtool_stats	= ipoib_get_ethtool_stats,
 };
 
 void ipoib_set_ethtool_ops(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
index 8b882bbd1d05..961c585da216 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_fs.c 1389 2004-12-27 22:56:47Z roland $
  */
 
 #include <linux/err.h>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f429bce24c20..66cafa20c246 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
  */
 
 #include <linux/delay.h>
@@ -290,7 +288,10 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	netif_receive_skb(skb);
+	if (dev->features & NETIF_F_LRO)
+		lro_receive_skb(&priv->lro.lro_mgr, skb, NULL);
+	else
+		netif_receive_skb(skb);
 
 repost:
 	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
@@ -442,6 +443,9 @@ poll_more:
 	}
 
 	if (done < budget) {
+		if (dev->features & NETIF_F_LRO)
+			lro_flush_all(&priv->lro.lro_mgr);
+
 		netif_rx_complete(dev, napi);
 		if (unlikely(ib_req_notify_cq(priv->recv_cq,
 					      IB_CQ_NEXT_COMP |
@@ -898,7 +902,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 }
 
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+				enum ipoib_flush_level level)
 {
 	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
@@ -911,7 +916,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * the parent is down.
 	 */
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		__ipoib_ib_dev_flush(cpriv, pkey_event);
+		__ipoib_ib_dev_flush(cpriv, level);
 
 	mutex_unlock(&priv->vlan_mutex);
 
@@ -925,7 +930,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		return;
 	}
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
@@ -943,11 +948,15 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		priv->pkey_index = new_index;
 	}
 
-	ipoib_dbg(priv, "flushing\n");
+	if (level == IPOIB_FLUSH_LIGHT) {
+		ipoib_mark_paths_invalid(dev);
+		ipoib_mcast_dev_flush(dev);
+	}
 
-	ipoib_ib_dev_down(dev, 0);
+	if (level >= IPOIB_FLUSH_NORMAL)
+		ipoib_ib_dev_down(dev, 0);
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		ipoib_ib_dev_stop(dev, 0);
 		ipoib_ib_dev_open(dev);
 	}
@@ -957,27 +966,34 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * we get here, don't bring it back up if it's not configured up
 	 */
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
-		ipoib_ib_dev_up(dev);
+		if (level >= IPOIB_FLUSH_NORMAL)
+			ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_light);
+
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+		container_of(work, struct ipoib_dev_priv, flush_normal);
 
-	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 0);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
 }
 
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+		container_of(work, struct ipoib_dev_priv, flush_heavy);
 
-	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 1);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2442090ac8d1..8be9ea0436e6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
  */
 
 #include "ipoib.h"
@@ -62,6 +60,15 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
 
+static int lro;
+module_param(lro, bool, 0444);
+MODULE_PARM_DESC(lro,  "Enable LRO (Large Receive Offload)");
+
+static int lro_max_aggr = IPOIB_LRO_MAX_AGGR;
+module_param(lro_max_aggr, int, 0644);
+MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated "
+		"(default = 64)");
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
 
@@ -350,6 +357,23 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 
 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 
+void ipoib_mark_paths_invalid(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+
+	spin_lock_irq(&priv->lock);
+
+	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+		ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n",
+			be16_to_cpu(path->pathrec.dlid),
+			IPOIB_GID_ARG(path->pathrec.dgid));
+		path->valid =  0;
+	}
+
+	spin_unlock_irq(&priv->lock);
+}
+
 void ipoib_flush_paths(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -386,6 +410,7 @@ static void path_rec_completion(int status,
 	struct net_device *dev = path->dev;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_ah *ah = NULL;
+	struct ipoib_ah *old_ah;
 	struct ipoib_neigh *neigh, *tn;
 	struct sk_buff_head skqueue;
 	struct sk_buff *skb;
@@ -409,6 +434,7 @@ static void path_rec_completion(int status,
 
 	spin_lock_irqsave(&priv->lock, flags);
 
+	old_ah   = path->ah;
 	path->ah = ah;
 
 	if (ah) {
@@ -421,6 +447,17 @@ static void path_rec_completion(int status,
 			__skb_queue_tail(&skqueue, skb);
 
 		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+			if (neigh->ah) {
+				WARN_ON(neigh->ah != old_ah);
+				/*
+				 * Dropping the ah reference inside
+				 * priv->lock is safe here, because we
+				 * will hold one more reference from
+				 * the original value of path->ah (ie
+				 * old_ah).
+				 */
+				ipoib_put_ah(neigh->ah);
+			}
 			kref_get(&path->ah->ref);
 			neigh->ah = path->ah;
 			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
@@ -443,6 +480,7 @@ static void path_rec_completion(int status,
 			while ((skb = __skb_dequeue(&neigh->queue)))
 				__skb_queue_tail(&skqueue, skb);
 		}
+		path->valid = 1;
 	}
 
 	path->query = NULL;
@@ -450,6 +488,9 @@ static void path_rec_completion(int status,
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
+	if (old_ah)
+		ipoib_put_ah(old_ah);
+
 	while ((skb = __skb_dequeue(&skqueue))) {
 		skb->dev = dev;
 		if (dev_queue_xmit(skb))
@@ -623,8 +664,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_lock(&priv->lock);
 
 	path = __path_find(dev, phdr->hwaddr + 4);
-	if (!path) {
-		path = path_rec_create(dev, phdr->hwaddr + 4);
+	if (!path || !path->valid) {
+		if (!path)
+			path = path_rec_create(dev, phdr->hwaddr + 4);
 		if (path) {
 			/* put pseudoheader back on for next time */
 			skb_push(skb, sizeof *phdr);
@@ -938,6 +980,54 @@ static const struct header_ops ipoib_header_ops = {
 	.create	= ipoib_hard_header,
 };
 
+static int get_skb_hdr(struct sk_buff *skb, void **iphdr,
+		       void **tcph, u64 *hdr_flags, void *priv)
+{
+	unsigned int ip_len;
+	struct iphdr *iph;
+
+	if (unlikely(skb->protocol != htons(ETH_P_IP)))
+		return -1;
+
+	/*
+	 * In the future we may add an else clause that verifies the
+	 * checksum and allows devices which do not calculate checksum
+	 * to use LRO.
+	 */
+	if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
+		return -1;
+
+	/* Check for non-TCP packet */
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	if (iph->protocol != IPPROTO_TCP)
+		return -1;
+
+	ip_len = ip_hdrlen(skb);
+	skb_set_transport_header(skb, ip_len);
+	*tcph = tcp_hdr(skb);
+
+	/* check if IP header and TCP header are complete */
+	if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb))
+		return -1;
+
+	*hdr_flags = LRO_IPV4 | LRO_TCP;
+	*iphdr = iph;
+
+	return 0;
+}
+
+static void ipoib_lro_setup(struct ipoib_dev_priv *priv)
+{
+	priv->lro.lro_mgr.max_aggr	 = lro_max_aggr;
+	priv->lro.lro_mgr.max_desc	 = IPOIB_MAX_LRO_DESCRIPTORS;
+	priv->lro.lro_mgr.lro_arr	 = priv->lro.lro_desc;
+	priv->lro.lro_mgr.get_skb_header = get_skb_hdr;
+	priv->lro.lro_mgr.features	 = LRO_F_NAPI;
+	priv->lro.lro_mgr.dev		 = priv->dev;
+	priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
+}
+
 static void ipoib_setup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -977,10 +1067,11 @@ static void ipoib_setup(struct net_device *dev)
 
 	priv->dev = dev;
 
+	ipoib_lro_setup(priv);
+
 	spin_lock_init(&priv->lock);
 	spin_lock_init(&priv->tx_lock);
 
-	mutex_init(&priv->mcast_mutex);
 	mutex_init(&priv->vlan_mutex);
 
 	INIT_LIST_HEAD(&priv->path_list);
@@ -989,9 +1080,10 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
+	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
+	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
+	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
 }
@@ -1154,6 +1246,9 @@ static struct net_device *ipoib_add_port(const char *format,
 		priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
 	}
 
+	if (lro)
+		priv->dev->features |= NETIF_F_LRO;
+
 	/*
 	 * Set the full membership bit, so that we join the right
 	 * broadcast group, etc.
@@ -1304,6 +1399,12 @@ static int __init ipoib_init_module(void)
 	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
 #endif
 
+	/*
+	 * When copying small received packets, we only copy from the
+	 * linear data part of the SKB, so we rely on this condition.
+	 */
+	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
+
 	ret = ipoib_register_debugfs();
 	if (ret)
 		return ret;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 3f663fb852c1..1fcc9a898d81 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $
  */
 
 #include <linux/skbuff.h>
@@ -188,6 +186,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_ah *ah;
 	int ret;
+	int set_qkey = 0;
 
 	mcast->mcmember = *mcmember;
 
@@ -202,6 +201,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
 		spin_unlock_irq(&priv->lock);
 		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+		set_qkey = 1;
 	}
 
 	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -214,7 +214,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 		}
 
 		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
-					 &mcast->mcmember.mgid);
+					 &mcast->mcmember.mgid, set_qkey);
 		if (ret < 0) {
 			ipoib_warn(priv, "couldn't attach QP to multicast group "
 				   IPOIB_GID_FMT "\n",
@@ -575,8 +575,11 @@ void ipoib_mcast_join_task(struct work_struct *work)
 
 	priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 
-	if (!ipoib_cm_admin_enabled(dev))
-		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+	if (!ipoib_cm_admin_enabled(dev)) {
+		rtnl_lock();
+		dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
+		rtnl_unlock();
+	}
 
 	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
 
@@ -594,10 +597,6 @@ int ipoib_mcast_start_thread(struct net_device *dev)
 		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
 	mutex_unlock(&mcast_mutex);
 
-	spin_lock_irq(&priv->lock);
-	set_bit(IPOIB_MCAST_STARTED, &priv->flags);
-	spin_unlock_irq(&priv->lock);
-
 	return 0;
 }
 
@@ -607,10 +606,6 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
 
 	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
-	spin_lock_irq(&priv->lock);
-	clear_bit(IPOIB_MCAST_STARTED, &priv->flags);
-	spin_unlock_irq(&priv->lock);
-
 	mutex_lock(&mcast_mutex);
 	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
 	cancel_delayed_work(&priv->mcast_task);
@@ -635,10 +630,10 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
 				IPOIB_GID_ARG(mcast->mcmember.mgid));
 
 		/* Remove ourselves from the multicast group */
-		ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
-					 &mcast->mcmember.mgid);
+		ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
+				      be16_to_cpu(mcast->mcmember.mlid));
 		if (ret)
-			ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
+			ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
 	}
 
 	return 0;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 8766d29ce3b7..68325119f740 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -29,24 +29,17 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include "ipoib.h"
 
-int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_qp_attr *qp_attr;
+	struct ib_qp_attr *qp_attr = NULL;
 	int ret;
 	u16 pkey_index;
 
-	ret = -ENOMEM;
-	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
-	if (!qp_attr)
-		goto out;
-
 	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 		ret = -ENXIO;
@@ -54,18 +47,23 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
 	}
 	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 
-	/* set correct QKey for QP */
-	qp_attr->qkey = priv->qkey;
-	ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
-	if (ret) {
-		ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
-		goto out;
+	if (set_qkey) {
+		ret = -ENOMEM;
+		qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+		if (!qp_attr)
+			goto out;
+
+		/* set correct QKey for QP */
+		qp_attr->qkey = priv->qkey;
+		ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
+		if (ret) {
+			ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+			goto out;
+		}
 	}
 
 	/* attach QP to multicast group */
-	mutex_lock(&priv->mcast_mutex);
 	ret = ib_attach_mcast(priv->qp, mgid, mlid);
-	mutex_unlock(&priv->mcast_mutex);
 	if (ret)
 		ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
 
@@ -74,20 +72,6 @@ out:
 	return ret;
 }
 
-int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	int ret;
-
-	mutex_lock(&priv->mcast_mutex);
-	ret = ib_detach_mcast(priv->qp, mgid, mlid);
-	mutex_unlock(&priv->mcast_mutex);
-	if (ret)
-		ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
-
-	return ret;
-}
-
 int ipoib_init_qp(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -201,7 +185,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	init_attr.recv_cq = priv->recv_cq;
 
 	if (priv->hca_caps & IB_DEVICE_UD_TSO)
-		init_attr.create_flags = IB_QP_CREATE_IPOIB_UD_LSO;
+		init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+	if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+		init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
 
 	if (dev->features & NETIF_F_SG)
 		init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
@@ -289,15 +276,17 @@ void ipoib_event(struct ib_event_handler *handler,
 	if (record->element.port_num != priv->port)
 		return;
 
-	if (record->event == IB_EVENT_PORT_ERR    ||
-	    record->event == IB_EVENT_PORT_ACTIVE ||
-	    record->event == IB_EVENT_LID_CHANGE  ||
-	    record->event == IB_EVENT_SM_CHANGE   ||
+	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+		  record->device->name, record->element.port_num);
+
+	if (record->event == IB_EVENT_SM_CHANGE ||
 	    record->event == IB_EVENT_CLIENT_REREGISTER) {
-		ipoib_dbg(priv, "Port state change event\n");
-		queue_work(ipoib_workqueue, &priv->flush_task);
+		queue_work(ipoib_workqueue, &priv->flush_light);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_normal);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
-		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
-		queue_work(ipoib_workqueue, &priv->pkey_event_task);
+		queue_work(ipoib_workqueue, &priv->flush_heavy);
 	}
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 1cdb5cfb0ff1..b08eb56196d3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_vlan.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index aeb58cae9a3f..356fac6d105a 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -42,9 +42,6 @@
  *	Zhenyu Wang
  * Modified by:
  *      Erez Zilber
- *
- *
- * $Id: iscsi_iser.c 6965 2006-05-07 11:36:20Z ogerlitz $
  */
 
 #include <linux/types.h>
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index a8c1b300e34d..0e10703cf59e 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -36,8 +36,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: iscsi_iser.h 7051 2006-05-10 12:29:11Z ogerlitz $
  */
 #ifndef __ISCSI_ISER_H__
 #define __ISCSI_ISER_H__
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 08dc81c46f41..31ad498bdc51 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: iser_initiator.c 6964 2006-05-07 11:11:43Z ogerlitz $
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index cac50c4dc159..81e49cb10ed3 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: iser_memory.c 6964 2006-05-07 11:11:43Z ogerlitz $
  */
 #include <linux/module.h>
 #include <linux/kernel.h>
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index d19cfe605ebb..77cabee7cc08 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: iser_verbs.c 7051 2006-05-10 12:29:11Z ogerlitz $
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 435145709dd6..ed7c5f72cb8b 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_srp.c 3932 2005-11-01 17:19:29Z roland $
  */
 
 #include <linux/module.h>
@@ -49,8 +47,6 @@
 #include <scsi/srp.h>
 #include <scsi/scsi_transport_srp.h>
 
-#include <rdma/ib_cache.h>
-
 #include "ib_srp.h"
 
 #define DRV_NAME	"ib_srp"
@@ -183,10 +179,10 @@ static int srp_init_qp(struct srp_target_port *target,
 	if (!attr)
 		return -ENOMEM;
 
-	ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev,
-				  target->srp_host->port,
-				  be16_to_cpu(target->path.pkey),
-				  &attr->pkey_index);
+	ret = ib_find_pkey(target->srp_host->srp_dev->dev,
+			   target->srp_host->port,
+			   be16_to_cpu(target->path.pkey),
+			   &attr->pkey_index);
 	if (ret)
 		goto out;
 
@@ -1883,8 +1879,7 @@ static ssize_t srp_create_target(struct device *dev,
 	if (ret)
 		goto err;
 
-	ib_get_cached_gid(host->srp_dev->dev, host->port, 0,
-			  &target->path.sgid);
+	ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid);
 
 	shost_printk(KERN_DEBUG, target->scsi_host, PFX
 		     "new target: id_ext %016llx ioc_guid %016llx pkey %04x "
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 63d2ae724061..e185b907fc12 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_srp.h 3932 2005-11-01 17:19:29Z roland $
  */
 
 #ifndef IB_SRP_H
diff --git a/drivers/net/cxgb3/cxgb3_ctl_defs.h b/drivers/net/cxgb3/cxgb3_ctl_defs.h
index 6c4f32066919..ed0ecd9679cb 100644
--- a/drivers/net/cxgb3/cxgb3_ctl_defs.h
+++ b/drivers/net/cxgb3/cxgb3_ctl_defs.h
@@ -54,6 +54,7 @@ enum {
 	RDMA_CQ_DISABLE		= 16,
 	RDMA_CTRL_QP_SETUP	= 17,
 	RDMA_GET_MEM		= 18,
+	RDMA_GET_MIB		= 19,
 
 	GET_RX_PAGE_INFO	= 50,
 };
diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c
index ff9c013ce535..cf2696873796 100644
--- a/drivers/net/cxgb3/cxgb3_offload.c
+++ b/drivers/net/cxgb3/cxgb3_offload.c
@@ -303,6 +303,12 @@ static int cxgb_rdma_ctl(struct adapter *adapter, unsigned int req, void *data)
 		spin_unlock_irq(&adapter->sge.reg_lock);
 		break;
 	}
+	case RDMA_GET_MIB: {
+		spin_lock(&adapter->stats_lock);
+		t3_tp_get_mib_stats(adapter, (struct tp_mib_stats *)data);
+		spin_unlock(&adapter->stats_lock);
+		break;
+	}
 	default:
 		ret = -EOPNOTSUPP;
 	}
@@ -381,6 +387,7 @@ static int cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data)
 	case RDMA_CQ_DISABLE:
 	case RDMA_CTRL_QP_SETUP:
 	case RDMA_GET_MEM:
+	case RDMA_GET_MIB:
 		if (!offload_running(adapter))
 			return -EAGAIN;
 		return cxgb_rdma_ctl(adapter, req, data);
diff --git a/drivers/net/cxgb3/version.h b/drivers/net/cxgb3/version.h
index a0177fc55e28..29db711303b9 100644
--- a/drivers/net/cxgb3/version.h
+++ b/drivers/net/cxgb3/version.h
@@ -38,7 +38,7 @@
 #define DRV_VERSION "1.0-ko"
 
 /* Firmware version */
-#define FW_VERSION_MAJOR 6
+#define FW_VERSION_MAJOR 7
 #define FW_VERSION_MINOR 0
 #define FW_VERSION_MICRO 0
 #endif				/* __CHELSIO_VERSION_H */
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index d82f2751d2c7..2b5006b9be67 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -101,6 +101,34 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
 			mlx4_dbg(dev, "    %s\n", fname[i]);
 }
 
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err = 0;
+
+#define MOD_STAT_CFG_IN_SIZE		0x100
+
+#define MOD_STAT_CFG_PG_SZ_M_OFFSET	0x002
+#define MOD_STAT_CFG_PG_SZ_OFFSET	0x003
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, MOD_STAT_CFG_IN_SIZE);
+
+	MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET);
+	MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET);
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG,
+			MLX4_CMD_TIME_CLASS_A);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
 int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	struct mlx4_cmd_mailbox *mailbox;
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 306cb9b0242d..a0e046c149b7 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -38,6 +38,11 @@
 #include "mlx4.h"
 #include "icm.h"
 
+struct mlx4_mod_stat_cfg {
+	u8 log_pg_sz;
+	u8 log_pg_sz_m;
+};
+
 struct mlx4_dev_cap {
 	int max_srq_sz;
 	int max_qp_sz;
@@ -162,5 +167,6 @@ int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
 int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
 int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
 int mlx4_NOP(struct mlx4_dev *dev);
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
 
 #endif /* MLX4_FW_H */
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index a6aa49fc1d68..d3736013fe9b 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -485,6 +485,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 	struct mlx4_priv	  *priv = mlx4_priv(dev);
 	struct mlx4_adapter	   adapter;
 	struct mlx4_dev_cap	   dev_cap;
+	struct mlx4_mod_stat_cfg   mlx4_cfg;
 	struct mlx4_profile	   profile;
 	struct mlx4_init_hca_param init_hca;
 	u64 icm_size;
@@ -502,6 +503,12 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 		return err;
 	}
 
+	mlx4_cfg.log_pg_sz_m = 1;
+	mlx4_cfg.log_pg_sz = 0;
+	err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
+	if (err)
+		mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
+
 	err = mlx4_dev_cap(dev, &dev_cap);
 	if (err) {
 		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c
index 57f7f1f0d4ec..b4b57870ddfd 100644
--- a/drivers/net/mlx4/mcg.c
+++ b/drivers/net/mlx4/mcg.c
@@ -38,6 +38,9 @@
 
 #include "mlx4.h"
 
+#define MGM_QPN_MASK       0x00FFFFFF
+#define MGM_BLCK_LB_BIT    30
+
 struct mlx4_mgm {
 	__be32			next_gid_index;
 	__be32			members_count;
@@ -153,7 +156,8 @@ static int find_mgm(struct mlx4_dev *dev,
 	return err;
 }
 
-int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_cmd_mailbox *mailbox;
@@ -202,13 +206,18 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
 	}
 
 	for (i = 0; i < members_count; ++i)
-		if (mgm->qp[i] == cpu_to_be32(qp->qpn)) {
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
 			mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn);
 			err = 0;
 			goto out;
 		}
 
-	mgm->qp[members_count++] = cpu_to_be32(qp->qpn);
+	if (block_mcast_loopback)
+		mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) |
+						       (1 << MGM_BLCK_LB_BIT));
+	else
+		mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK);
+
 	mgm->members_count       = cpu_to_be32(members_count);
 
 	err = mlx4_WRITE_MCG(dev, index, mailbox);
@@ -283,7 +292,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
 
 	members_count = be32_to_cpu(mgm->members_count);
 	for (loc = -1, i = 0; i < members_count; ++i)
-		if (mgm->qp[i] == cpu_to_be32(qp->qpn))
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn)
 			loc = i;
 
 	if (loc == -1) {
diff --git a/firmware/Makefile b/firmware/Makefile
index 809a52624bda..e4f2fb3d1917 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -34,8 +34,6 @@ fw-shipped-$(CONFIG_SND_SB16_CSP) += sb16/mulaw_main.csp sb16/alaw_main.csp \
 				     sb16/ima_adpcm_capture.csp
 fw-shipped-$(CONFIG_SND_YMFPCI) += yamaha/ds1_ctrl.fw yamaha/ds1_dsp.fw \
 				   yamaha/ds1e_ctrl.fw
-fw-shipped-$(CONFIG_TIGON3) += tigon/tg3.bin tigon/tg3_tso.bin \
-			       tigon/tg3_tso5.bin
 fw-shipped-$(CONFIG_USB_DABUSB) += dabusb/firmware.fw dabusb/bitstream.bin
 fw-shipped-$(CONFIG_USB_EMI26) += emi26/loader.fw emi26/firmware.fw \
 				  emi26/bitstream.fw
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..5fa1512cd9a2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 			 */
 			clear_buffer_dirty(bh);
 			set_buffer_uptodate(bh);
-		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+			   buffer_dirty(bh)) {
 			WARN_ON(bh->b_size != blocksize);
 			err = get_block(inode, block, bh, 1);
 			if (err)
 				goto recover;
+			clear_buffer_delay(bh);
 			if (buffer_new(bh)) {
 				/* blockdev mappings never come here */
 				clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
 	bh = head;
 	/* Recovery: lock and submit the mapped buffers */
 	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
+		if (buffer_mapped(bh) && buffer_dirty(bh) &&
+		    !buffer_delay(bh)) {
 			lock_buffer(bh);
 			mark_buffer_async_write(bh);
 		} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
+	int i_size_changed = 0;
 
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	 */
 	if (pos+copied > inode->i_size) {
 		i_size_write(inode, pos+copied);
-		mark_inode_dirty(inode);
+		i_size_changed = 1;
 	}
 
 	unlock_page(page);
 	page_cache_release(page);
 
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+
 	return copied;
 }
 EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 			ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
 	if (actual_group == block_group)
 		return 1;
 	return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 				le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
 		}
 	} else { /* For META_BG_BLOCK_GROUPS */
-		int group_rel = (block_group -
-				 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
-				EXT4_DESC_PER_BLOCK(sb);
-		if (group_rel == 0 || group_rel == 1 ||
-		    (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
-			bit_max += 1;
+		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}
 
 	if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
 	return 0;
 }
 /**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
  * @sb:			super block
  * @block_group:	given block group
  *
@@ -305,7 +300,7 @@ err_out:
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc * desc;
 	struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
 		prev = rsv;
 	}
 	printk("Window map complete.\n");
-	if (bad)
-		BUG();
+	BUG_ON(bad);
 }
 #define rsv_window_dump(root, verbose) \
 	__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
 		count -= overflow;
 	}
 	brelse(bitmap_bh);
-	bitmap_bh = read_block_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
 	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks += count;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
+
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
 
 /**
  * ext4_has_free_blocks()
- * @sbi:		in-core super block structure.
+ * @sbi:	in-core super block structure.
+ * @nblocks:	number of neeed blocks
  *
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks avaible for allocation for this request
+ * On success, return nblocks
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks)
 {
-	ext4_fsblk_t free_blocks, root_blocks;
+	ext4_fsblk_t free_blocks;
+	ext4_fsblk_t root_blocks = 0;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = ext4_r_blocks_count(sbi->s_es);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+	if (!capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		return 0;
-	}
-	return 1;
-}
+		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+	if (free_blocks - root_blocks < FBC_BATCH)
+		free_blocks =
+			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+	if (free_blocks - root_blocks < nblocks)
+		return free_blocks - root_blocks;
+	return nblocks;
+ }
+
 
 /**
  * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
 		return 0;
 
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 
 /**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
  * @handle:		handle to this transaction
  * @inode:		file inode
  * @goal:		given target block(filesystem wide)
  * @count:		target number of blocks to allocate
  * @errp:		error code
  *
- * ext4_new_blocks uses a goal block to assist allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation.  It tries to
+ * allocate block(s) from the block group contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
  *
  */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
 	ext4_group_t ngroups;
 	unsigned long num = *count;
 
-	*errp = -ENOSPC;
 	sb = inode->i_sb;
 	if (!sb) {
+		*errp = -ENODEV;
 		printk("ext4_new_block: nonexistent device");
 		return 0;
 	}
 
+	sbi = EXT4_SB(sb);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+		/*
+		 * With delalloc we already reserved the blocks
+		 */
+		*count = ext4_has_free_blocks(sbi, *count);
+	}
+	if (*count == 0) {
+		*errp = -ENOSPC;
+		return 0;	/*return with ENOSPC error */
+	}
+	num = *count;
+
 	/*
 	 * Check quota for allocation of this block.
 	 */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
 	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
 		my_rsv = &block_i->rsv_window_node;
 
-	if (!ext4_has_free_blocks(sbi)) {
-		*errp = -ENOSPC;
-		goto out;
-	}
-
 	/*
 	 * First, test whether the goal block is free.
 	 */
@@ -1734,7 +1759,7 @@ retry_alloc:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
 		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
 			continue;
 
 		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
 		/*
@@ -1882,7 +1907,15 @@ allocated:
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks -= num;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
 	return 0;
 }
 
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+				ext4_lblk_t iblock, ext4_fsblk_t goal,
+				unsigned long *count, int *errp, int flags)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
 
 	if (!test_opt(inode->i_sb, MBALLOC)) {
-		unsigned long count = 1;
-		ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-		return ret;
+		return ext4_old_new_blocks(handle, inode, goal, count, errp);
 	}
 
 	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+
 	ar.inode = inode;
 	ar.goal = goal;
-	ar.len = 1;
+	ar.len = *count;
+	ar.logical = iblock;
+
+	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+		/* enable in-core preallocation for data block allocation */
+		ar.flags = EXT4_MB_HINT_DATA;
+	else
+		/* disable in-core preallocation for non-regular files */
+		ar.flags = 0;
+
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	*count = ar.len;
 	return ret;
 }
 
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @count:		total number of blocks need
+ * @errp:               error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
-
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
-		return ret;
+	ret = do_blk_alloc(handle, inode, 0, goal,
+				count, errp, EXT4_META_BLOCK);
+	/*
+	 * Account for the allocated meta blocks
+	 */
+	if (!(*errp)) {
+		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+		EXT4_I(inode)->i_allocated_meta_blocks += *count;
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	}
-
-	memset(&ar, 0, sizeof(ar));
-	ar.inode = inode;
-	ar.goal = goal;
-	ar.len = *count;
-	ret = ext4_mb_new_blocks(handle, &ar, errp);
-	*count = ar.len;
 	return ret;
 }
 
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @errp:               error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+		ext4_fsblk_t goal, int *errp)
+{
+	unsigned long count = 1;
+	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @count:		total number of blocks need
+ * @errp:               error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+				ext4_lblk_t iblock, ext4_fsblk_t goal,
+				unsigned long *count, int *errp)
+{
+	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
 
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
 		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, i);
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
 		if (bitmap_bh == NULL)
 			continue;
 
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+						0, 0, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 
 	while (n) {
 		/* Do the node's children first */
-		if ((n)->rb_left) {
+		if (n->rb_left) {
 			n = n->rb_left;
 			continue;
 		}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	root->rb_node = NULL;
 }
 
 
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
 {
 	struct dir_private_info *p;
 
-	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
 	if (!p)
 		return NULL;
-	p->root.rb_node = NULL;
-	p->curr_node = NULL;
-	p->extra_fname = NULL;
-	p->last_pos = 0;
 	p->curr_hash = pos2maj_hash(pos);
 	p->curr_minor_hash = pos2min_hash(pos);
-	p->next_hash = 0;
 	return p;
 }
 
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
 	int	ret;
 
 	if (!info) {
-		info = create_dir_info(filp->f_pos);
+		info = ext4_htree_create_dir_info(filp->f_pos);
 		if (!info)
 			return -ENOMEM;
 		filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
 #include "ext4_i.h"
 
 /*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
  */
 
 /*
@@ -45,7 +45,7 @@
 #define ext4_debug(f, a...)						\
 	do {								\
 		printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
-			__FILE__, __LINE__, __FUNCTION__);		\
+			__FILE__, __LINE__, __func__);			\
 		printk (KERN_DEBUG f, ## a);				\
 	} while (0)
 #else
@@ -74,6 +74,9 @@
 #define EXT4_MB_HINT_GOAL_ONLY		256
 /* goal is meaningful */
 #define EXT4_MB_HINT_TRY_GOAL		512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED      1024
+
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
 	__u32	bg_reserved2[3];
 };
 
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+	__u32 free_inodes;
+	__u32 free_blocks;
+};
+
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
 	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
-	__u32   s_reserved[163];        /* Padding to the end of the block */
+	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
+	__u8	s_reserved_char_pad2;
+	__le16  s_reserved_pad;
+	__u32   s_reserved[162];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+					ext4_lblk_t iblock, ext4_fsblk_t goal,
+					unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 		unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+		ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+		ext4_grpblk_t add);
 
 
 /* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode (struct inode *, int);
 extern int  ext4_setattr (struct dentry *, struct iattr *);
+extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				struct kstat *stat);
 extern void ext4_delete_inode (struct inode *);
 extern int  ext4_sync_inode (handle_t *, struct inode *);
 extern void ext4_discard_reservation (struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 }
 
 
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+					     ext4_group_t block_group)
+{
+	return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+	return 1 << sbi->s_log_groups_per_flex;
+}
+
 #define ext4_std_error(sb, errno)				\
 do {								\
 	if ((errno))						\
-		__ext4_std_error((sb), __FUNCTION__, (errno));	\
+		__ext4_std_error((sb), __func__, (errno));	\
 } while (0)
 
 /*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
 			sector_t block, unsigned long max_blocks,
 			struct buffer_head *bh, int create,
-			int extend_disksize);
+			int extend_disksize, int flag);
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
 
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
 };
 
 /*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
  */
 struct ext4_inode_info {
 	__le32	i_data[15];	/* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
+	struct jbd2_inode jinode;
 
 	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
 	/* mballoc */
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
+
+	/* allocation reservation info for delalloc */
+	unsigned long i_reserved_data_blocks;
+	unsigned long i_reserved_meta_blocks;
+	unsigned long i_allocated_meta_blocks;
+	unsigned short i_delalloc_reserved_flag;
+	spinlock_t i_block_reservation_lock;
 };
 
 #endif	/* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
 				handle_t *handle, struct buffer_head *bh);
 
 #define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
-	__ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_write_access(__func__, (handle), (bh))
 #define ext4_journal_revoke(handle, blocknr, bh) \
-	__ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+	__ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
-	__ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_create_access(__func__, (handle), (bh))
 #define ext4_journal_dirty_metadata(handle, bh) \
-	__ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+	__ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
-	__ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+	__ext4_journal_forget(__func__, (handle), (bh))
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 }
 
 #define ext4_journal_stop(handle) \
-	__ext4_journal_stop(__FUNCTION__, (handle))
+	__ext4_journal_stop(__func__, (handle))
 
 static inline handle_t *ext4_journal_current_handle(void)
 {
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 	return jbd2_journal_force_commit(journal);
 }
 
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
 #include <linux/rbtree.h>
 
 /*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
  */
 struct ext4_sb_info {
 	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
 
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
+
+	unsigned int s_log_groups_per_flex;
+	struct flex_groups *s_flex_groups;
 };
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
 
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
 	int err;
 
 	if (handle->h_buffer_credits > needed)
-		return handle;
-	if (!ext4_journal_extend(handle, needed))
-		return handle;
-	err = ext4_journal_restart(handle, needed);
-
-	return handle;
+		return 0;
+	err = ext4_journal_extend(handle, needed);
+	if (err)
+		return err;
+	return ext4_journal_restart(handle, needed);
 }
 
 /*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	return bg_start + colour + block;
 }
 
+/*
+ * Allocation for a meta data block
+ */
 static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
 			struct ext4_extent *ex, int *err)
 {
 	ext4_fsblk_t goal, newblock;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_block(handle, inode, goal, err);
+	newblock = ext4_new_meta_block(handle, inode, goal, err);
 	return newblock;
 }
 
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
 	return size;
 }
 
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int lcap, icap, rcap, leafs, idxs, num;
+	int newextents = blocks;
+
+	rcap = ext4_ext_space_root_idx(inode);
+	lcap = ext4_ext_space_block(inode);
+	icap = ext4_ext_space_block_idx(inode);
+
+	/* number of new leaf blocks needed */
+	num = leafs = (newextents + lcap - 1) / lcap;
+
+	/*
+	 * Worse case, we need separate index block(s)
+	 * to link all new leaf blocks
+	 */
+	idxs = (leafs + icap - 1) / icap;
+	do {
+		num += idxs;
+		idxs = (idxs + icap - 1) / icap;
+	} while (idxs > rcap);
+
+	return num;
+}
+
 static int
 ext4_ext_max_entries(struct inode *inode, int depth)
 {
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		alloc = 1;
 	}
 	path[0].p_hdr = eh;
+	path[0].p_bh = NULL;
 
 	i = depth;
 	/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	}
 
 	path[ppos].p_depth = i;
-	path[ppos].p_hdr = eh;
 	path[ppos].p_ext = NULL;
 	path[ppos].p_idx = NULL;
 
 	/* find extent */
 	ext4_ext_binsearch(inode, path + ppos, block);
+	/* if not an empty leaf */
+	if (path[ppos].p_ext)
+		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
 
 	ext4_ext_show_path(inode, path);
 
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	/* allocate all needed blocks */
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
-		newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+		newblock = ext4_ext_new_meta_block(handle, inode, path,
+						   newext, &err);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
 	if (newblock == 0)
 		return err;
 
@@ -981,6 +1017,8 @@ repeat:
 		/* if we found index with free entry, then use that
 		 * entry: create all needed subtree and add new leaf */
 		err = ext4_ext_split(handle, inode, path, newext, i);
+		if (err)
+			goto out;
 
 		/* refill path */
 		ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
 
-		handle = ext4_ext_journal_restart(handle, credits);
-		if (IS_ERR(handle)) {
-			err = PTR_ERR(handle);
+		err = ext4_ext_journal_restart(handle, credits);
+		if (err)
 			goto out;
-		}
 
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	int err = 0, depth, ret;
 	unsigned long allocated = 0;
 	struct ext4_allocation_request ar;
+	loff_t disksize;
 
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				 */
 				if (allocated > max_blocks)
 					allocated = max_blocks;
-				/* mark the buffer unwritten */
-				__set_bit(BH_Unwritten, &bh_result->b_state);
+				set_buffer_unwritten(bh_result);
 				goto out2;
 			}
 
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		goto out2;
 	}
 
-	if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
-		EXT4_I(inode)->i_disksize = inode->i_size;
-
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-	__set_bit(BH_New, &bh_result->b_state);
+	if (extend_disksize) {
+		disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = disksize;
+	}
+
+	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
 	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
 	if (allocated > max_blocks)
 		allocated = max_blocks;
 	ext4_ext_show_leaf(inode, path);
-	__set_bit(BH_Mapped, &bh_result->b_state);
+	set_buffer_mapped(bh_result);
 	bh_result->b_bdev = inode->i_sb->s_bdev;
 	bh_result->b_blocknr = newblock;
 out2:
@@ -2744,7 +2785,7 @@ out2:
 	return err ? err : allocated;
 }
 
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	 */
 	err = ext4_writepage_trans_blocks(inode) + 3;
 	handle = ext4_journal_start(inode, err);
-	if (IS_ERR(handle)) {
-		if (page) {
-			clear_highpage(page);
-			flush_dcache_page(page);
-			unlock_page(page);
-			page_cache_release(page);
-		}
+	if (IS_ERR(handle))
 		return;
-	}
 
-	if (page)
-		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+	if (inode->i_size & (sb->s_blocksize - 1))
+		ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+	if (ext4_orphan_add(handle, inode))
+		goto out_stop;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	 * Probably we need not scan at all,
 	 * because page truncation is enough.
 	 */
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
 
 	/* we have to know where to truncate from in crash case */
 	EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 		handle->h_sync = 1;
 
 out_stop:
+	up_write(&EXT4_I(inode)->i_data_sem);
 	/*
 	 * If this was a simple ftruncate() and the file will remain alive,
 	 * then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	up_write(&EXT4_I(inode)->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
 		}
 		ret = ext4_get_blocks_wrap(handle, inode, block,
 					  max_blocks, &map_bh,
-					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
+					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
 	return ret;
 }
 
+static struct vm_operations_struct ext4_file_vm_ops = {
+	.fault		= filemap_fault,
+	.page_mkwrite   = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	file_accessed(file);
+	vma->vm_ops = &ext4_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	return 0;
+}
+
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
-	.mmap		= generic_file_mmap,
+	.mmap		= ext4_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
 const struct inode_operations ext4_file_inode_operations = {
 	.truncate	= ext4_truncate,
 	.setattr	= ext4_setattr,
+	.getattr	= ext4_getattr,
 #ifdef CONFIG_EXT4DEV_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
+#include <linux/blkdev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -45,6 +46,7 @@
 int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 	int ret = 0;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
 		ret = sync_inode(inode, &wbc);
+		if (journal && (journal->j_flags & JBD2_BARRIER))
+			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	}
 out:
 	return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
 				   struct ext4_group_desc *gdp);
 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
 				       struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
 				      ext4_group_t block_group);
 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 				       struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	struct ext4_super_block * es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err;
+	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
 		printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 			if (is_directory)
 				percpu_counter_dec(&sbi->s_dirs_counter);
 
+			if (sbi->s_log_groups_per_flex) {
+				flex_group = ext4_flex_group(sbi, block_group);
+				spin_lock(sb_bgl_lock(sbi, flex_group));
+				sbi->s_flex_groups[flex_group].free_inodes++;
+				spin_unlock(sb_bgl_lock(sbi, flex_group));
+			}
 		}
 		BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
 		err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 	return ret;
 }
 
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+			   ext4_group_t *best_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh;
+	struct flex_groups *flex_group = sbi->s_flex_groups;
+	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+	ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+	ext4_group_t ngroups = sbi->s_groups_count;
+	int flex_size = ext4_flex_bg_size(sbi);
+	ext4_group_t best_flex = parent_fbg_group;
+	int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+	int flexbg_free_blocks;
+	int flex_freeb_ratio;
+	ext4_group_t n_fbg_groups;
+	ext4_group_t i;
+
+	n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+		sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+	flexbg_free_blocks = flex_group[best_flex].free_blocks;
+	flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+	if (flex_group[best_flex].free_inodes &&
+	    flex_freeb_ratio > free_block_ratio)
+		goto found_flexbg;
+
+	if (best_flex && best_flex == parent_fbg_group) {
+		best_flex--;
+		goto find_close_to_parent;
+	}
+
+	for (i = 0; i < n_fbg_groups; i++) {
+		if (i == parent_fbg_group || i == parent_fbg_group - 1)
+			continue;
+
+		flexbg_free_blocks = flex_group[i].free_blocks;
+		flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+		if (flex_freeb_ratio > free_block_ratio &&
+		    flex_group[i].free_inodes) {
+			best_flex = i;
+			goto found_flexbg;
+		}
+
+		if (best_flex < 0 ||
+		    (flex_group[i].free_blocks >
+		     flex_group[best_flex].free_blocks &&
+		     flex_group[i].free_inodes))
+			best_flex = i;
+	}
+
+	if (!flex_group[best_flex].free_inodes ||
+	    !flex_group[best_flex].free_blocks)
+		return -1;
+
+found_flexbg:
+	for (i = best_flex * flex_size; i < ngroups &&
+		     i < (best_flex + 1) * flex_size; i++) {
+		desc = ext4_get_group_desc(sb, i, &bh);
+		if (le16_to_cpu(desc->bg_free_inodes_count)) {
+			*best_group = i;
+			goto out;
+		}
+	}
+
+	return -1;
+out:
+	return 0;
+}
+
 /*
  * Orlov's allocator for directories.
  *
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	struct inode *ret;
 	ext4_group_t i;
 	int free = 0;
+	ext4_group_t flex_group;
 
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 
 	sbi = EXT4_SB(sb);
 	es = sbi->s_es;
+
+	if (sbi->s_log_groups_per_flex) {
+		ret2 = find_group_flex(sb, dir, &group);
+		goto got_group;
+	}
+
 	if (S_ISDIR(mode)) {
 		if (test_opt (sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	} else
 		ret2 = find_group_other(sb, dir, &group);
 
+got_group:
 	err = -ENOSPC;
 	if (ret2 == -1)
 		goto out;
@@ -600,7 +689,7 @@ got:
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-		struct buffer_head *block_bh = read_block_bitmap(sb, group);
+		struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
 
 		BUFFER_TRACE(block_bh, "get block bitmap access");
 		err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
 		percpu_counter_inc(&sbi->s_dirs_counter);
 	sb->s_dirt = 1;
 
+	if (sbi->s_log_groups_per_flex) {
+		flex_group = ext4_flex_group(sbi, group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_inodes--;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
+
 	inode->i_uid = current->fsuid;
 	if (test_opt (sb, GRPID))
 		inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
 		goto fail_free_drop;
 
 	if (test_opt(sb, EXTENTS)) {
-		/* set extent flag only for diretory, file and normal symlink*/
+		/* set extent flag only for directory, file and normal symlink*/
 		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
 			EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
 			ext4_ext_tree_init(handle, inode);
-			err = ext4_update_incompat_feature(handle, sb,
-					EXT4_FEATURE_INCOMPAT_EXTENTS);
-			if (err)
-				goto fail_free_drop;
 		}
 	}
 
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	if (IS_ERR(inode))
 		goto iget_failed;
 
+	/*
+	 * If the orphans has i_nlinks > 0 then it should be able to be
+	 * truncated, otherwise it won't be removed from the orphan list
+	 * during processing and an infinite loop will result.
+	 */
+	if (inode->i_nlink && !ext4_can_truncate(inode))
+		goto bad_orphan;
+
 	if (NEXT_ORPHAN(inode) > max_ino)
 		goto bad_orphan;
 	brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
 		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
 		       NEXT_ORPHAN(inode));
 		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
 		/* Avoid freeing blocks if we got a bad deleted inode */
 		if (inode->i_nlink == 0)
 			inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "ext4_extents.h"
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+					      loff_t new_size)
+{
+	return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+						   new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
 {
 	handle_t *handle;
 
+	if (ext4_should_order_data(inode))
+		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
 
 	if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
  *		direct blocks
  */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int indirect_blks, int blks,
-			ext4_fsblk_t new_blocks[4], int *err)
+				ext4_lblk_t iblock, ext4_fsblk_t goal,
+				int indirect_blks, int blks,
+				ext4_fsblk_t new_blocks[4], int *err)
 {
 	int target, i;
-	unsigned long count = 0;
+	unsigned long count = 0, blk_allocated = 0;
 	int index = 0;
 	ext4_fsblk_t current_block = 0;
 	int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 	 * the first direct block of this branch.  That's the
 	 * minimum number of blocks need to allocate(required)
 	 */
-	target = blks + indirect_blks;
-
-	while (1) {
+	/* first we try to allocate the indirect blocks */
+	target = indirect_blks;
+	while (target > 0) {
 		count = target;
 		/* allocating blocks for indirect blocks and direct blocks */
-		current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+		current_block = ext4_new_meta_blocks(handle, inode,
+							goal, &count, err);
 		if (*err)
 			goto failed_out;
 
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 			new_blocks[index++] = current_block++;
 			count--;
 		}
-
-		if (count > 0)
+		if (count > 0) {
+			/*
+			 * save the new block number
+			 * for the first direct block
+			 */
+			new_blocks[index] = current_block;
+			printk(KERN_INFO "%s returned more blocks than "
+						"requested\n", __func__);
+			WARN_ON(1);
 			break;
+		}
 	}
 
-	/* save the new block number for the first direct block */
-	new_blocks[index] = current_block;
-
+	target = blks - count ;
+	blk_allocated = count;
+	if (!target)
+		goto allocated;
+	/* Now allocate data blocks */
+	count = target;
+	/* allocating blocks for data blocks */
+	current_block = ext4_new_blocks(handle, inode, iblock,
+						goal, &count, err);
+	if (*err && (target == blks)) {
+		/*
+		 * if the allocation failed and we didn't allocate
+		 * any blocks before
+		 */
+		goto failed_out;
+	}
+	if (!*err) {
+		if (target == blks) {
+		/*
+		 * save the new block number
+		 * for the first direct block
+		 */
+			new_blocks[index] = current_block;
+		}
+		blk_allocated += count;
+	}
+allocated:
 	/* total number of blocks allocated for direct blocks */
-	ret = count;
+	ret = blk_allocated;
 	*err = 0;
 	return ret;
 failed_out:
@@ -584,8 +631,9 @@ failed_out:
  *	as described above and return 0.
  */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, ext4_fsblk_t goal,
-			ext4_lblk_t *offsets, Indirect *branch)
+				ext4_lblk_t iblock, int indirect_blks,
+				int *blks, ext4_fsblk_t goal,
+				ext4_lblk_t *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t new_blocks[4];
 	ext4_fsblk_t current_block;
 
-	num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
 				*blks, new_blocks, &err);
 	if (err)
 		return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
+	loff_t disksize;
 
 
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 	/*
 	 * Block out ext4_truncate while we alter the tree
 	 */
-	err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
-				offsets + (partial - chain), partial);
+	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+					&count, goal,
+					offsets + (partial - chain), partial);
 
 	/*
 	 * The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 	 * protect it if you're about to implement concurrent
 	 * ext4_get_block() -bzzz
 	*/
-	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-		ei->i_disksize = inode->i_size;
+	if (!err && extend_disksize) {
+		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > ei->i_disksize)
+			ei->i_disksize = disksize;
+	}
 	if (err)
 		goto cleanup;
 
@@ -934,7 +989,7 @@ out:
  */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 			unsigned long max_blocks, struct buffer_head *bh,
-			int create, int extend_disksize)
+			int create, int extend_disksize, int flag)
 {
 	int retval;
 
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 	 * with create == 1 flag.
 	 */
 	down_write((&EXT4_I(inode)->i_data_sem));
+
+	/*
+	 * if the caller is from delayed allocation writeout path
+	 * we have already reserved fs blocks for allocation
+	 * let the underlying get_block() function know to
+	 * avoid double accounting
+	 */
+	if (flag)
+		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
 	/*
 	 * We need to check for EXT4 here because migrate
 	 * could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 							~EXT4_EXT_MIGRATE;
 		}
 	}
+
+	if (flag) {
+		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+		/*
+		 * Update reserved blocks/metadata blocks
+		 * after successful block allocation
+		 * which were deferred till now
+		 */
+		if ((retval > 0) && buffer_delay(bh))
+			ext4_da_release_space(inode, retval, 0);
+	}
+
 	up_write((&EXT4_I(inode)->i_data_sem));
 	return retval;
 }
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
 	}
 
 	ret = ext4_get_blocks_wrap(handle, inode, iblock,
-					max_blocks, bh_result, create, 0);
+					max_blocks, bh_result, create, 0, 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
 	err = ext4_get_blocks_wrap(handle, inode, block, 1,
-					&dummy, create, 1);
+					&dummy, create, 1, 0);
 	/*
 	 * ext4_get_blocks_handle() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
  	to = from + len;
 
 retry:
- 	page = __grab_cache_page(mapping, index);
- 	if (!page)
- 		return -ENOMEM;
- 	*pagep = page;
-
   	handle = ext4_journal_start(inode, needed_blocks);
   	if (IS_ERR(handle)) {
- 		unlock_page(page);
- 		page_cache_release(page);
   		ret = PTR_ERR(handle);
   		goto out;
 	}
 
+	page = __grab_cache_page(mapping, index);
+	if (!page) {
+		ext4_journal_stop(handle);
+		ret = -ENOMEM;
+		goto out;
+	}
+	*pagep = page;
+
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 							ext4_get_block);
 
@@ -1225,8 +1302,8 @@ retry:
 	}
 
 	if (ret) {
-		ext4_journal_stop(handle);
  		unlock_page(page);
+		ext4_journal_stop(handle);
  		page_cache_release(page);
 	}
 
@@ -1236,15 +1313,6 @@ out:
 	return ret;
 }
 
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-	int err = jbd2_journal_dirty_data(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(__func__, __func__,
-						bh, handle, err);
-	return err;
-}
-
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
-{
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
-		mark_inode_dirty(inode);
-	}
-
-	return copied;
-}
-
-/*
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
  *
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
 				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = mapping->host;
 	unsigned from, to;
 	int ret = 0, ret2;
 
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
-	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext4_journal_dirty_data);
+	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
 		/*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
 		new_i_size = pos + copied;
 		if (new_i_size > EXT4_I(inode)->i_disksize)
 			EXT4_I(inode)->i_disksize = new_i_size;
-		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+		ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
 		if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	unlock_page(page);
-	page_cache_release(page);
 
 	return ret ? ret : copied;
 }
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
 				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 	loff_t new_i_size;
 
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
 	if (new_i_size > EXT4_I(inode)->i_disksize)
 		EXT4_I(inode)->i_disksize = new_i_size;
 
-	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+	ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
 	if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	unlock_page(page);
-	page_cache_release(page);
 
 	return ret ? ret : copied;
 }
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
 			ret = ret2;
 	}
 
+	unlock_page(page);
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	unlock_page(page);
 	page_cache_release(page);
 
 	return ret ? ret : copied;
 }
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate @blocks for non extent file based file
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int ind_blks, dind_blks, tind_blks;
+
+	/* number of new indirect blocks needed */
+	ind_blks = (blocks + icap - 1) / icap;
+
+	dind_blks = (ind_blks + icap - 1) / icap;
+
+	tind_blks = 1;
+
+	return ind_blks + dind_blks + tind_blks;
+}
+
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_calc_metadata_amount(inode, blocks);
+
+	return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       unsigned long md_needed, mdblocks, total = 0;
+
+	/*
+	 * recalculate the amount of metadata blocks to reserve
+	 * in order to allocate nrblocks
+	 * worse case is one extent per block
+	 */
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+	mdblocks = ext4_calc_metadata_amount(inode, total);
+	BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+	total = md_needed + nrblocks;
+
+	if (ext4_has_free_blocks(sbi, total) < total) {
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		return -ENOSPC;
+	}
+
+	/* reduce fs free blocks counter */
+	percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+
+	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+	EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+	return 0;       /* success */
+}
+
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int total, mdb, mdb_free, release;
+
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	/* recalculate the number of metablocks still need to be reserved */
+	total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+	mdb = ext4_calc_metadata_amount(inode, total);
+
+	/* figure out how many metablocks to release */
+	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+	/* Account for allocated meta_blocks */
+	mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+	release = to_free + mdb_free;
+
+	/* update fs free blocks counter for truncate case */
+	percpu_counter_add(&sbi->s_freeblocks_counter, release);
+
+	/* update per-inode reservations */
+	BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+	EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+
+	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+	EXT4_I(inode)->i_allocated_meta_blocks = 0;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+						unsigned long offset)
+{
+	int to_release = 0;
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+
+		if ((offset <= curr_off) && (buffer_delay(bh))) {
+			to_release++;
+			clear_buffer_delay(bh);
+		}
+		curr_off = next_off;
+	} while ((bh = bh->b_this_page) != head);
+	ext4_da_release_space(page->mapping->host, 0, to_release);
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+	struct inode *inode;
+	struct buffer_head lbh;			/* extent of blocks */
+	unsigned long first_page, next_page;	/* extent of pages */
+	get_block_t *get_block;
+	struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+	struct address_space *mapping = mpd->inode->i_mapping;
+	struct mpage_data mpd_pp = {
+		.bio = NULL,
+		.last_block_in_bio = 0,
+		.get_block = mpd->get_block,
+		.use_writepage = 1,
+	};
+	int ret = 0, err, nr_pages, i;
+	unsigned long index, end;
+	struct pagevec pvec;
+
+	BUG_ON(mpd->next_page <= mpd->first_page);
+
+	pagevec_init(&pvec, 0);
+	index = mpd->first_page;
+	end = mpd->next_page - 1;
+
+	while (index <= end) {
+		/* XXX: optimize tail */
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+			/*
+			 * In error case, we have to continue because
+			 * remaining pages are still locked
+			 * XXX: unlock and re-dirty them?
+			 */
+			if (ret == 0)
+				ret = err;
+		}
+		pagevec_release(&pvec);
+	}
+	if (mpd_pp.bio)
+		mpage_bio_submit(WRITE, mpd_pp.bio);
+
+	return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and put actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+				 struct buffer_head *exbh)
+{
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
+	int blocks = exbh->b_size >> inode->i_blkbits;
+	sector_t pblock = exbh->b_blocknr, cur_logical;
+	struct buffer_head *head, *bh;
+	unsigned long index, end;
+	struct pagevec pvec;
+	int nr_pages, i;
+
+	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	pagevec_init(&pvec, 0);
+
+	while (index <= end) {
+		/* XXX: optimize tail */
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+			BUG_ON(!page_has_buffers(page));
+
+			bh = page_buffers(page);
+			head = bh;
+
+			/* skip blocks out of the range */
+			do {
+				if (cur_logical >= logical)
+					break;
+				cur_logical++;
+			} while ((bh = bh->b_this_page) != head);
+
+			do {
+				if (cur_logical >= logical + blocks)
+					break;
+				if (buffer_delay(bh)) {
+					bh->b_blocknr = pblock;
+					clear_buffer_delay(bh);
+				} else if (buffer_mapped(bh))
+					BUG_ON(bh->b_blocknr != pblock);
+
+				cur_logical++;
+				pblock++;
+			} while ((bh = bh->b_this_page) != head);
+		}
+		pagevec_release(&pvec);
+	}
+}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+					     struct buffer_head *bh)
+{
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	int blocks, i;
+
+	blocks = bh->b_size >> inode->i_blkbits;
+	for (i = 0; i < blocks; i++)
+		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+	struct buffer_head *lbh = &mpd->lbh;
+	int err = 0, remain = lbh->b_size;
+	sector_t next = lbh->b_blocknr;
+	struct buffer_head new;
+
+	/*
+	 * We consider only non-mapped and non-allocated blocks
+	 */
+	if (buffer_mapped(lbh) && !buffer_delay(lbh))
+		return;
+
+	while (remain) {
+		new.b_state = lbh->b_state;
+		new.b_blocknr = 0;
+		new.b_size = remain;
+		err = mpd->get_block(mpd->inode, next, &new, 1);
+		if (err) {
+			/*
+			 * Rather than implement own error handling
+			 * here, we just leave remaining blocks
+			 * unallocated and try again with ->writepage()
+			 */
+			break;
+		}
+		BUG_ON(new.b_size == 0);
+
+		if (buffer_new(&new))
+			__unmap_underlying_blocks(mpd->inode, &new);
+
+		/*
+		 * If blocks are delayed marked, we need to
+		 * put actual blocknr and drop delayed bit
+		 */
+		if (buffer_delay(lbh))
+			mpage_put_bnr_to_bhs(mpd, next, &new);
+
+		/* go for the remaining blocks */
+		next += new.b_size >> mpd->inode->i_blkbits;
+		remain -= new.b_size;
+	}
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+				   sector_t logical, struct buffer_head *bh)
+{
+	struct buffer_head *lbh = &mpd->lbh;
+	sector_t next;
+
+	next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+	/*
+	 * First block in the extent
+	 */
+	if (lbh->b_size == 0) {
+		lbh->b_blocknr = logical;
+		lbh->b_size = bh->b_size;
+		lbh->b_state = bh->b_state & BH_FLAGS;
+		return;
+	}
+
+	/*
+	 * Can we merge the block to our big extent?
+	 */
+	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+		lbh->b_size += bh->b_size;
+		return;
+	}
+
+	/*
+	 * We couldn't merge the block to our extent, so we
+	 * need to flush current  extent and start new one
+	 */
+	mpage_da_map_blocks(mpd);
+
+	/*
+	 * Now start a new extent
+	 */
+	lbh->b_size = bh->b_size;
+	lbh->b_state = bh->b_state & BH_FLAGS;
+	lbh->b_blocknr = logical;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+				struct writeback_control *wbc, void *data)
+{
+	struct mpage_da_data *mpd = data;
+	struct inode *inode = mpd->inode;
+	struct buffer_head *bh, *head, fake;
+	sector_t logical;
+
+	/*
+	 * Can we merge this page to current extent?
+	 */
+	if (mpd->next_page != page->index) {
+		/*
+		 * Nope, we can't. So, we map non-allocated blocks
+		 * and start IO on them using __mpage_writepage()
+		 */
+		if (mpd->next_page != mpd->first_page) {
+			mpage_da_map_blocks(mpd);
+			mpage_da_submit_io(mpd);
+		}
+
+		/*
+		 * Start next extent of pages ...
+		 */
+		mpd->first_page = page->index;
+
+		/*
+		 * ... and blocks
+		 */
+		mpd->lbh.b_size = 0;
+		mpd->lbh.b_state = 0;
+		mpd->lbh.b_blocknr = 0;
+	}
+
+	mpd->next_page = page->index + 1;
+	logical = (sector_t) page->index <<
+		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	if (!page_has_buffers(page)) {
+		/*
+		 * There is no attached buffer heads yet (mmap?)
+		 * we treat the page asfull of dirty blocks
+		 */
+		bh = &fake;
+		bh->b_size = PAGE_CACHE_SIZE;
+		bh->b_state = 0;
+		set_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		mpage_add_bh_to_extent(mpd, logical, bh);
+	} else {
+		/*
+		 * Page with regular buffer heads, just add all dirty ones
+		 */
+		head = page_buffers(page);
+		bh = head;
+		do {
+			BUG_ON(buffer_locked(bh));
+			if (buffer_dirty(bh))
+				mpage_add_bh_to_extent(mpd, logical, bh);
+			logical++;
+		} while ((bh = bh->b_this_page) != head);
+	}
+
+	return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issue IO them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bio per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+			       struct writeback_control *wbc,
+			       get_block_t get_block)
+{
+	struct mpage_da_data mpd;
+	int ret;
+
+	if (!get_block)
+		return generic_writepages(mapping, wbc);
+
+	mpd.wbc = wbc;
+	mpd.inode = mapping->host;
+	mpd.lbh.b_size = 0;
+	mpd.lbh.b_state = 0;
+	mpd.lbh.b_blocknr = 0;
+	mpd.first_page = 0;
+	mpd.next_page = 0;
+	mpd.get_block = get_block;
+
+	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+	/*
+	 * Handle last extent of pages
+	 */
+	if (mpd.next_page != mpd.first_page) {
+		mpage_da_map_blocks(&mpd);
+		mpage_da_submit_io(&mpd);
+	}
+
+	return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * it's intention is to return mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				  struct buffer_head *bh_result, int create)
+{
+	int ret = 0;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+	/*
+	 * first, we need to know whether the block is allocated already
+	 * preallocated blocks are unmapped but should treated
+	 * the same as allocated blocks.
+	 */
+	ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+	if ((ret == 0) && !buffer_delay(bh_result)) {
+		/* the block isn't (pre)allocated yet, let's reserve space */
+		/*
+		 * XXX: __block_prepare_write() unmaps passed block,
+		 * is it OK?
+		 */
+		ret = ext4_da_reserve_space(inode, 1);
+		if (ret)
+			/* not enough space to reserve */
+			return ret;
+
+		map_bh(bh_result, inode->i_sb, 0);
+		set_buffer_new(bh_result);
+		set_buffer_delay(bh_result);
+	} else if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+
+	return ret;
+}
+#define		EXT4_DELALLOC_RSVED	1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int ret;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	handle = ext4_journal_current_handle();
+	if (!handle) {
+		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+				   bh_result, 0, 0, 0);
+		BUG_ON(!ret);
+	} else {
+		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+	}
+
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+
+		/*
+		 * Update on-disk size along with block allocation
+		 * we don't use 'extend_disksize' as size may change
+		 * within already allocated block -bzzz
+		 */
+		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > EXT4_I(inode)->i_disksize) {
+			/*
+			 * XXX: replace with spinlock if seen contended -bzzz
+			 */
+			down_write(&EXT4_I(inode)->i_data_sem);
+			if (disksize > EXT4_I(inode)->i_disksize)
+				EXT4_I(inode)->i_disksize = disksize;
+			up_write(&EXT4_I(inode)->i_data_sem);
+
+			if (EXT4_I(inode)->i_disksize == disksize) {
+				ret = ext4_mark_inode_dirty(handle, inode);
+				return ret;
+			}
+		}
+		ret = 0;
+	}
+	return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * unmapped buffer is possible for holes.
+	 * delay buffer is possible with delayed allocation
+	 */
+	return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int ret = 0;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+	/*
+	 * we don't want to do block allocation in writepage
+	 * so call get_block_wrap with create = 0
+	 */
+	ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+				   bh_result, 0, 0, 0);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+	return ret;
+}
+
+/*
+ * get called vi ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	int ret = 0;
+	loff_t size;
+	unsigned long len;
+	struct buffer_head *page_bufs;
+	struct inode *inode = page->mapping->host;
+
+	size = i_size_read(inode);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (page_has_buffers(page)) {
+		page_bufs = page_buffers(page);
+		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+					ext4_bh_unmapped_or_delay)) {
+			/*
+			 * We don't want to do  block allocation
+			 * So redirty the page and return
+			 * We may reach here when we do a journal commit
+			 * via journal_submit_inode_data_buffers.
+			 * If we don't have mapping block we just ignore
+			 * them. We can also reach here via shrink_page_list
+			 */
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+	} else {
+		/*
+		 * The test for page_has_buffers() is subtle:
+		 * We know the page is dirty but it lost buffers. That means
+		 * that at some moment in time after write_begin()/write_end()
+		 * has been called all buffers have been clean and thus they
+		 * must have been written at least once. So they are all
+		 * mapped and we can happily proceed with mapping them
+		 * and writing the page.
+		 *
+		 * Try to initialize the buffer_heads and check whether
+		 * all are mapped and non delay. We don't want to
+		 * do block allocation here.
+		 */
+		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+						ext4_normal_get_block_write);
+		if (!ret) {
+			page_bufs = page_buffers(page);
+			/* check whether all are mapped and non delay */
+			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+						ext4_bh_unmapped_or_delay)) {
+				redirty_page_for_writepage(wbc, page);
+				unlock_page(page);
+				return 0;
+			}
+		} else {
+			/*
+			 * We can't do block allocation here
+			 * so just redity the page and unlock
+			 * and return
+			 */
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+	}
+
+	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+		ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+	else
+		ret = block_write_full_page(page,
+						ext4_normal_get_block_write,
+						wbc);
+
+	return ret;
+}
+
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS
+
+static int ext4_da_writepages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	handle_t *handle = NULL;
+	int needed_blocks;
+	int ret = 0;
+	long to_write;
+	loff_t range_start = 0;
+
+	/*
+	 * No pages to write? This is mainly a kludge to avoid starting
+	 * a transaction for special inodes like journal inode on last iput()
+	 * because that could violate lock ordering on umount
+	 */
+	if (!mapping->nrpages)
+		return 0;
+
+	/*
+	 * Estimate the worse case needed credits to write out
+	 * EXT4_MAX_BUF_BLOCKS pages
+	 */
+	needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+	to_write = wbc->nr_to_write;
+	if (!wbc->range_cyclic) {
+		/*
+		 * If range_cyclic is not set force range_cont
+		 * and save the old writeback_index
+		 */
+		wbc->range_cont = 1;
+		range_start =  wbc->range_start;
+	}
+
+	while (!ret && to_write) {
+		/* start a new transaction*/
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_writepages;
+		}
+		if (ext4_should_order_data(inode)) {
+			/*
+			 * With ordered mode we need to add
+			 * the inode to the journal handle
+			 * when we do block allocation.
+			 */
+			ret = ext4_jbd2_file_inode(handle, inode);
+			if (ret) {
+				ext4_journal_stop(handle);
+				goto out_writepages;
+			}
+
+		}
+		/*
+		 * set the max dirty pages could be write at a time
+		 * to fit into the reserved transaction credits
+		 */
+		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+		to_write -= wbc->nr_to_write;
+		ret = mpage_da_writepages(mapping, wbc,
+						ext4_da_get_block_write);
+		ext4_journal_stop(handle);
+		if (wbc->nr_to_write) {
+			/*
+			 * There is no more writeout needed
+			 * or we requested for a noblocking writeout
+			 * and we found the device congested
+			 */
+			to_write += wbc->nr_to_write;
+			break;
+		}
+		wbc->nr_to_write = to_write;
+	}
+
+out_writepages:
+	wbc->nr_to_write = to_write;
+	if (range_start)
+		wbc->range_start = range_start;
+	return ret;
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	int ret, retries = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned from, to;
+	struct inode *inode = mapping->host;
+	handle_t *handle;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+retry:
+	/*
+	 * With delayed allocation, we don't log the i_disksize update
+	 * if there is delayed block allocation. But we still need
+	 * to journalling the i_disksize update if writes to the end
+	 * of file which has an already mapped buffer.
+	 */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+							ext4_da_get_block_prep);
+	if (ret < 0) {
+		unlock_page(page);
+		ext4_journal_stop(handle);
+		page_cache_release(page);
+	}
+
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+out:
+	return ret;
+}
+
+/*
+ * Check if we should update i_disksize
+ * when write to the end of file but not require block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+					 unsigned long offset)
+{
+	struct buffer_head *bh;
+	struct inode *inode = page->mapping->host;
+	unsigned int idx;
+	int i;
+
+	bh = page_buffers(page);
+	idx = offset >> inode->i_blkbits;
+
+	for (i=0; i < idx; i++)
+		bh = bh->b_this_page;
+
+	if (!buffer_mapped(bh) || (buffer_delay(bh)))
+		return 0;
+	return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int ret = 0, ret2;
+	handle_t *handle = ext4_journal_current_handle();
+	loff_t new_i_size;
+	unsigned long start, end;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + copied -1;
+
+	/*
+	 * generic_write_end() will run mark_inode_dirty() if i_size
+	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
+	 * into that.
+	 */
+
+	new_i_size = pos + copied;
+	if (new_i_size > EXT4_I(inode)->i_disksize) {
+		if (ext4_da_should_update_i_disksize(page, end)) {
+			down_write(&EXT4_I(inode)->i_data_sem);
+			if (new_i_size > EXT4_I(inode)->i_disksize) {
+				/*
+				 * Updating i_disksize when extending file
+				 * without needing block allocation
+				 */
+				if (ext4_should_order_data(inode))
+					ret = ext4_jbd2_file_inode(handle,
+								   inode);
+
+				EXT4_I(inode)->i_disksize = new_i_size;
+			}
+			up_write(&EXT4_I(inode)->i_data_sem);
+		}
+	}
+	ret2 = generic_write_end(file, mapping, pos, len, copied,
+							page, fsdata);
+	copied = ret2;
+	if (ret2 < 0)
+		ret = ret2;
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+
+	return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	/*
+	 * Drop reserved blocks
+	 */
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	ext4_da_page_release_reservation(page, offset);
+
+out:
+	ext4_invalidatepage(page, offset);
+
+	return;
+}
+
 
 /*
  * bmap() is special.  It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 	journal_t *journal;
 	int err;
 
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+			test_opt(inode->i_sb, DELALLOC)) {
+		/*
+		 * With delalloc we want to sync the file
+		 * so that we can make sure we allocate
+		 * blocks for file
+		 */
+		filemap_write_and_wait(mapping);
+	}
+
 	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-	if (buffer_mapped(bh))
-		return ext4_journal_dirty_data(handle, bh);
-	return 0;
-}
-
 /*
- * Note that we always start a transaction even if we're not journalling
- * data.  This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), noone guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
  *
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
  *
  * Problem:
  *
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
  * disastrous.  Any write() or metadata operation will sync the fs for
  * us.
  *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
  */
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	struct buffer_head *page_bufs;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
-
-	J_ASSERT(PageLocked(page));
-
-	/*
-	 * We give up here if we're reentered, because it might be for a
-	 * different filesystem.
-	 */
-	if (ext4_journal_current_handle())
-		goto out_fail;
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (test_opt(inode->i_sb, NOBH))
+		return nobh_writepage(page,
+					ext4_normal_get_block_write, wbc);
+	else
+		return block_write_full_page(page,
+						ext4_normal_get_block_write,
+						wbc);
+}
 
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out_fail;
-	}
+static int ext4_normal_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t size = i_size_read(inode);
+	loff_t len;
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				(1 << BH_Dirty)|(1 << BH_Uptodate));
+	J_ASSERT(PageLocked(page));
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (page_has_buffers(page)) {
+		/* if page has buffers it should all be mapped
+		 * and allocated. If there are not buffers attached
+		 * to the page we know the page is dirty but it lost
+		 * buffers. That means that at some moment in time
+		 * after write_begin() / write_end() has been called
+		 * all buffers have been clean and thus they must have been
+		 * written at least once. So they are all mapped and we can
+		 * happily proceed with mapping them and writing the page.
+		 */
+		BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+					ext4_bh_unmapped_or_delay));
 	}
-	page_bufs = page_buffers(page);
-	walk_page_buffers(handle, page_bufs, 0,
-			PAGE_CACHE_SIZE, NULL, bget_one);
-
-	ret = block_write_full_page(page, ext4_get_block, wbc);
 
-	/*
-	 * The page can become unlocked at any point now, and
-	 * truncate can then come in and change things.  So we
-	 * can't touch *page from now on.  But *page_bufs is
-	 * safe due to elevated refcount.
-	 */
+	if (!ext4_journal_current_handle())
+		return __ext4_normal_writepage(page, wbc);
 
-	/*
-	 * And attach them to the current transaction.  But only if
-	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
-	 * and generally junk.
-	 */
-	if (ret == 0) {
-		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
-					NULL, jbd2_journal_dirty_data_fn);
-		if (!ret)
-			ret = err;
-	}
-	walk_page_buffers(handle, page_bufs, 0,
-			PAGE_CACHE_SIZE, NULL, bput_one);
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-
-out_fail:
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
-	return ret;
+	return 0;
 }
 
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
-	struct inode *inode = page->mapping->host;
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs;
 	handle_t *handle = NULL;
 	int ret = 0;
 	int err;
 
-	if (ext4_journal_current_handle())
-		goto out_fail;
+	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+					ext4_normal_get_block_write);
+	if (ret != 0)
+		goto out_unlock;
+
+	page_bufs = page_buffers(page);
+	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+								bget_one);
+	/* As soon as we unlock the page, it can go away, but we have
+	 * references to buffers so we are safe */
+	unlock_page(page);
 
 	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_fail;
+		goto out;
 	}
 
-	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-		ret = nobh_writepage(page, ext4_get_block, wbc);
-	else
-		ret = block_write_full_page(page, ext4_get_block, wbc);
+	ret = walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
 
+	err = walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, write_end_fn);
+	if (ret == 0)
+		ret = err;
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
-	return ret;
 
-out_fail:
-	redirty_page_for_writepage(wbc, page);
+	walk_page_buffers(handle, page_bufs, 0,
+				PAGE_CACHE_SIZE, NULL, bput_one);
+	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+	goto out;
+
+out_unlock:
 	unlock_page(page);
+out:
 	return ret;
 }
 
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	handle_t *handle = NULL;
-	int ret = 0;
-	int err;
+	loff_t size = i_size_read(inode);
+	loff_t len;
 
-	if (ext4_journal_current_handle())
-		goto no_write;
+	J_ASSERT(PageLocked(page));
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (page_has_buffers(page)) {
+		/* if page has buffers it should all be mapped
+		 * and allocated. If there are not buffers attached
+		 * to the page we know the page is dirty but it lost
+		 * buffers. That means that at some moment in time
+		 * after write_begin() / write_end() has been called
+		 * all buffers have been clean and thus they must have been
+		 * written at least once. So they are all mapped and we can
+		 * happily proceed with mapping them and writing the page.
+		 */
+		BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+					ext4_bh_unmapped_or_delay));
+	}
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
+	if (ext4_journal_current_handle())
 		goto no_write;
-	}
 
-	if (!page_has_buffers(page) || PageChecked(page)) {
+	if (PageChecked(page)) {
 		/*
 		 * It's mmapped pagecache.  Add buffers and journal it.  There
 		 * doesn't seem much point in redirtying the page here.
 		 */
 		ClearPageChecked(page);
-		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-					ext4_get_block);
-		if (ret != 0) {
-			ext4_journal_stop(handle);
-			goto out_unlock;
-		}
-		ret = walk_page_buffers(handle, page_buffers(page), 0,
-			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
-		err = walk_page_buffers(handle, page_buffers(page), 0,
-				PAGE_CACHE_SIZE, NULL, write_end_fn);
-		if (ret == 0)
-			ret = err;
-		EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-		unlock_page(page);
+		return __ext4_journalled_writepage(page, wbc);
 	} else {
 		/*
 		 * It may be a page full of checkpoint-mode buffers.  We don't
 		 * really know unless we go poke around in the buffer_heads.
 		 * But block_write_full_page will do the right thing.
 		 */
-		ret = block_write_full_page(page, ext4_get_block, wbc);
+		return block_write_full_page(page,
+						ext4_normal_get_block_write,
+						wbc);
 	}
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-out:
-	return ret;
-
 no_write:
 	redirty_page_for_writepage(wbc, page);
-out_unlock:
 	unlock_page(page);
-	goto out;
+	return 0;
 }
 
 static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 static const struct address_space_operations ext4_ordered_aops = {
 	.readpage	= ext4_readpage,
 	.readpages	= ext4_readpages,
-	.writepage	= ext4_ordered_writepage,
+	.writepage	= ext4_normal_writepage,
 	.sync_page	= block_sync_page,
 	.write_begin	= ext4_write_begin,
 	.write_end	= ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 static const struct address_space_operations ext4_writeback_aops = {
 	.readpage	= ext4_readpage,
 	.readpages	= ext4_readpages,
-	.writepage	= ext4_writeback_writepage,
+	.writepage	= ext4_normal_writepage,
 	.sync_page	= block_sync_page,
 	.write_begin	= ext4_write_begin,
 	.write_end	= ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.releasepage	= ext4_releasepage,
 };
 
+static const struct address_space_operations ext4_da_aops = {
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_da_writepage,
+	.writepages	= ext4_da_writepages,
+	.sync_page	= block_sync_page,
+	.write_begin	= ext4_da_write_begin,
+	.write_end	= ext4_da_write_end,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_da_invalidatepage,
+	.releasepage	= ext4_releasepage,
+	.direct_IO	= ext4_direct_IO,
+	.migratepage	= buffer_migrate_page,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
-	if (ext4_should_order_data(inode))
+	if (ext4_should_order_data(inode) &&
+		test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
+	else if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode) &&
+		 test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
 	else if (ext4_should_writeback_data(inode))
 		inode->i_mapping->a_ops = &ext4_writeback_aops;
 	else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
  * This required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  */
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
+	struct page *page;
 	int err = 0;
 
+	page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+	if (!page)
+		return -EINVAL;
+
 	blocksize = inode->i_sb->s_blocksize;
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 		err = ext4_journal_dirty_metadata(handle, bh);
 	} else {
 		if (ext4_should_order_data(inode))
-			err = ext4_journal_dirty_data(handle, bh);
+			err = ext4_jbd2_file_inode(handle, inode);
 		mark_buffer_dirty(bh);
 	}
 
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 
 	if (this_bh) {
 		BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
-		ext4_journal_dirty_metadata(handle, this_bh);
+
+		/*
+		 * The buffer head should have an attached journal head at this
+		 * point. However, if the data is corrupted and an indirect
+		 * block pointed to itself, it would have been detached when
+		 * the block was cleared. Check for this instead of OOPSing.
+		 */
+		if (bh2jh(this_bh))
+			ext4_journal_dirty_metadata(handle, this_bh);
+		else
+			ext4_error(inode->i_sb, __func__,
+				   "circular indirect block detected, "
+				   "inode=%lu, block=%llu",
+				   inode->i_ino,
+				   (unsigned long long) this_bh->b_blocknr);
 	}
 }
 
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 	}
 }
 
+int ext4_can_truncate(struct inode *inode)
+{
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return 0;
+	if (S_ISREG(inode->i_mode))
+		return 1;
+	if (S_ISDIR(inode->i_mode))
+		return 1;
+	if (S_ISLNK(inode->i_mode))
+		return !ext4_inode_is_fast_symlink(inode);
+	return 0;
+}
+
 /*
  * ext4_truncate()
  *
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
 	int n;
 	ext4_lblk_t last_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
-	struct page *page;
 
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	    S_ISLNK(inode->i_mode)))
-		return;
-	if (ext4_inode_is_fast_symlink(inode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+	if (!ext4_can_truncate(inode))
 		return;
 
-	/*
-	 * We have to lock the EOF page here, because lock_page() nests
-	 * outside jbd2_journal_start().
-	 */
-	if ((inode->i_size & (blocksize - 1)) == 0) {
-		/* Block boundary? Nothing to do */
-		page = NULL;
-	} else {
-		page = grab_cache_page(mapping,
-				inode->i_size >> PAGE_CACHE_SHIFT);
-		if (!page)
-			return;
-	}
-
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		ext4_ext_truncate(inode, page);
+		ext4_ext_truncate(inode);
 		return;
 	}
 
 	handle = start_transaction(inode);
-	if (IS_ERR(handle)) {
-		if (page) {
-			clear_highpage(page);
-			flush_dcache_page(page);
-			unlock_page(page);
-			page_cache_release(page);
-		}
+	if (IS_ERR(handle))
 		return;		/* AKPM: return what? */
-	}
 
 	last_block = (inode->i_size + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
-	if (page)
-		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+	if (inode->i_size & (blocksize - 1))
+		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+			goto out_stop;
 
 	n = ext4_block_to_path(inode, last_block, offsets, NULL);
 	if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
 		goto out_stop;
 
 	/*
+	 * From here we block out all ext4_get_block() callers who want to
+	 * modify the block allocation tree.
+	 */
+	down_write(&ei->i_data_sem);
+	/*
 	 * The orphan list entry will now protect us from any crash which
 	 * occurs before the truncate completes, so it is now safe to propagate
 	 * the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
 	 */
 	ei->i_disksize = inode->i_size;
 
-	/*
-	 * From here we block out all ext4_get_block() callers who want to
-	 * modify the block allocation tree.
-	 */
-	down_write(&ei->i_data_sem);
-
 	if (n == 1) {		/* direct blocks */
 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
 			       i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
  * be freed, so we have a strong guarantee that no future commit will
  * leave these blocks visible to the user.)
  *
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
  */
 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		if (!error)
 			error = rc;
 		ext4_journal_stop(handle);
+
+		if (ext4_should_order_data(inode)) {
+			error = ext4_begin_ordered_truncate(inode,
+							    attr->ia_size);
+			if (error) {
+				/* Do as much error cleanup as possible */
+				handle = ext4_journal_start(inode, 3);
+				if (IS_ERR(handle)) {
+					ext4_orphan_del(NULL, inode);
+					goto err_out;
+				}
+				ext4_orphan_del(handle, inode);
+				ext4_journal_stop(handle);
+				goto err_out;
+			}
+		}
 	}
 
 	rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
 	return error;
 }
 
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct inode *inode;
+	unsigned long delalloc_blocks;
+
+	inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+
+	/*
+	 * We can't update i_blocks if the block allocation is delayed
+	 * otherwise in the case of system crash before the real block
+	 * allocation is done, we will have i_blocks inconsistent with
+	 * on-disk file blocks.
+	 * We always keep i_blocks updated together with real
+	 * allocation. But to not confuse with user, stat
+	 * will return the blocks that include the delayed allocation
+	 * blocks for this file.
+	 */
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+	return 0;
+}
 
 /*
  * How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	return err;
 }
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	loff_t size;
+	unsigned long len;
+	int ret = -EINVAL;
+	struct file *file = vma->vm_file;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+	 * get i_mutex because we are already holding mmap_sem.
+	 */
+	down_read(&inode->i_alloc_sem);
+	size = i_size_read(inode);
+	if (page->mapping != mapping || size <= page_offset(page)
+	    || !PageUptodate(page)) {
+		/* page got truncated from under us? */
+		goto out_unlock;
+	}
+	ret = 0;
+	if (PageMappedToDisk(page))
+		goto out_unlock;
+
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (page_has_buffers(page)) {
+		/* return if we have all the buffers mapped */
+		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+				       ext4_bh_unmapped))
+			goto out_unlock;
+	}
+	/*
+	 * OK, we need to fill the hole... Do write_begin write_end
+	 * to do block allocation/reservation.We are not holding
+	 * inode.i__mutex here. That allow * parallel write_begin,
+	 * write_end call. lock_page prevent this from happening
+	 * on the same page though
+	 */
+	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+			len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	if (ret < 0)
+		goto out_unlock;
+	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+			len, len, page, NULL);
+	if (ret < 0)
+		goto out_unlock;
+	ret = 0;
+out_unlock:
+	up_read(&inode->i_alloc_sem);
+	return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
 
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
-	int fix = 0;
+	int fix = 0, ret, tmpmax;
 	addr = mb_correct_addr_and_bit(&fix, addr);
-	max += fix;
+	tmpmax = max + fix;
 	start += fix;
 
-	return ext4_find_next_zero_bit(addr, max, start) - fix;
+	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+	if (ret > max)
+		return max;
+	return ret;
 }
 
 static inline int mb_find_next_bit(void *addr, int max, int start)
 {
-	int fix = 0;
+	int fix = 0, ret, tmpmax;
 	addr = mb_correct_addr_and_bit(&fix, addr);
-	max += fix;
+	tmpmax = max + fix;
 	start += fix;
 
-	return ext4_find_next_bit(addr, max, start) - fix;
+	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+	if (ret > max)
+		return max;
+	return ret;
 }
 
 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (!buffer_uptodate(bh[i]))
 			goto out;
 
+	err = 0;
 	first_block = page->index * blocks_per_page;
 	for (i = 0; i < blocks_per_page; i++) {
 		int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	int pnum;
 	int poff;
 	struct page *page;
+	int ret;
 
 	mb_debug("load group %lu\n", group);
 
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 		if (page) {
 			BUG_ON(page->mapping != inode->i_mapping);
 			if (!PageUptodate(page)) {
-				ext4_mb_init_cache(page, NULL);
+				ret = ext4_mb_init_cache(page, NULL);
+				if (ret) {
+					unlock_page(page);
+					goto err;
+				}
 				mb_cmp_bitmaps(e4b, page_address(page) +
 					       (poff * sb->s_blocksize));
 			}
 			unlock_page(page);
 		}
 	}
-	if (page == NULL || !PageUptodate(page))
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
 		goto err;
+	}
 	e4b->bd_bitmap_page = page;
 	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
 	mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 		if (page) {
 			BUG_ON(page->mapping != inode->i_mapping);
-			if (!PageUptodate(page))
-				ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+			if (!PageUptodate(page)) {
+				ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+				if (ret) {
+					unlock_page(page);
+					goto err;
+				}
+			}
 			unlock_page(page);
 		}
 	}
-	if (page == NULL || !PageUptodate(page))
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
 		goto err;
+	}
 	e4b->bd_buddy_page = page;
 	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
 	mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
 		page_cache_release(e4b->bd_buddy_page);
 	e4b->bd_buddy = NULL;
 	e4b->bd_bitmap = NULL;
-	return -EIO;
+	return ret;
 }
 
 static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
 	}
 }
 
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			  int first, int count)
 {
 	int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr += block;
 			blocknr +=
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+			ext4_unlock_group(sb, e4b->bd_group);
 			ext4_error(sb, __func__, "double-free of inode"
 				   " %lu's block %llu(bit %u in group %lu)\n",
 				   inode ? inode->i_ino : 0, blocknr, block,
 				   e4b->bd_group);
+			ext4_lock_group(sb, e4b->bd_group);
 		}
 		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
 		e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 		} while (1);
 	}
 	mb_check_buddy(e4b);
-
-	return 0;
 }
 
 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
 		spin_unlock(&sbi->s_md_lock);
 	}
-
-	/* searching for the right group start from the goal value specified */
-	group = ac->ac_g_ex.fe_group;
-
 	/* Let's just scan groups to find more-less suitable blocks */
 	cr = ac->ac_2order ? 0 : 1;
 	/*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 repeat:
 	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
 		ac->ac_criteria = cr;
+		/*
+		 * searching for the right group start
+		 * from the goal value specified
+		 */
+		group = ac->ac_g_ex.fe_group;
+
 		for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
 			struct ext4_group_info *grp;
 			struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
 	int rc;
 	int size;
 
+	if (unlikely(sbi->s_mb_history == NULL))
+		return -ENOMEM;
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (s == NULL)
 		return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
 	sbi->s_mb_history_cur = 0;
 	spin_lock_init(&sbi->s_mb_history_lock);
 	i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-	sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
-	if (likely(sbi->s_mb_history != NULL))
-		memset(sbi->s_mb_history, 0, i);
+	sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
 	/* if we can't allocate history, then we simple won't use it */
 }
 
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
 #define ext4_mb_history_init(sb)
 #endif
 
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+			  struct ext4_group_desc *desc)
+{
+	int i, len;
+	int metalen = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info **meta_group_info;
+
+	/*
+	 * First check if this group is the first of a reserved block.
+	 * If it's true, we have to allocate a new table of pointers
+	 * to ext4_group_info structures
+	 */
+	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+		metalen = sizeof(*meta_group_info) <<
+			EXT4_DESC_PER_BLOCK_BITS(sb);
+		meta_group_info = kmalloc(metalen, GFP_KERNEL);
+		if (meta_group_info == NULL) {
+			printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+			       "buddy group\n");
+			goto exit_meta_group_info;
+		}
+		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+			meta_group_info;
+	}
+
+	/*
+	 * calculate needed size. if change bb_counters size,
+	 * don't forget about ext4_mb_generate_buddy()
+	 */
+	len = offsetof(typeof(**meta_group_info),
+		       bb_counters[sb->s_blocksize_bits + 2]);
+
+	meta_group_info =
+		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+	meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+	if (meta_group_info[i] == NULL) {
+		printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+		goto exit_group_info;
+	}
+	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+		&(meta_group_info[i]->bb_state));
+
+	/*
+	 * initialize bb_free to be able to skip
+	 * empty groups without initialization
+	 */
+	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		meta_group_info[i]->bb_free =
+			ext4_free_blocks_after_init(sb, group, desc);
+	} else {
+		meta_group_info[i]->bb_free =
+			le16_to_cpu(desc->bg_free_blocks_count);
+	}
+
+	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+	{
+		struct buffer_head *bh;
+		meta_group_info[i]->bb_bitmap =
+			kmalloc(sb->s_blocksize, GFP_KERNEL);
+		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+		bh = ext4_read_block_bitmap(sb, group);
+		BUG_ON(bh == NULL);
+		memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+			sb->s_blocksize);
+		put_bh(bh);
+	}
+#endif
+
+	return 0;
+
+exit_group_info:
+	/* If a meta_group_info table has been allocated, release it now */
+	if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+		kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+exit_meta_group_info:
+	return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+			       struct ext4_group_desc *desc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	int blocks_per_page;
+	int block;
+	int pnum;
+	struct page *page;
+	int err;
+
+	/* Add group based on group descriptor*/
+	err = ext4_mb_add_groupinfo(sb, group, desc);
+	if (err)
+		return err;
+
+	/*
+	 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
+	 * datas) are set not up to date so that they will be re-initilaized
+	 * during the next call to ext4_mb_load_buddy
+	 */
+
+	/* Set buddy page as not up to date */
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page != NULL) {
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+
+	/* Set bitmap page as not up to date */
+	block++;
+	pnum = block / blocks_per_page;
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page != NULL) {
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+}
+
+/*
+ * Update an existing group.
+ * This function is used for online resize
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+	grp->bb_free += add;
+}
+
 static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t i;
-	int j, len, metalen;
+	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int num_meta_group_infos =
-		(sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
-			EXT4_DESC_PER_BLOCK_BITS(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int num_meta_group_infos;
+	int num_meta_group_infos_max;
+	int array_size;
 	struct ext4_group_info **meta_group_info;
+	struct ext4_group_desc *desc;
+
+	/* This is the number of blocks used by GDT */
+	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+	/*
+	 * This is the total number of blocks used by GDT including
+	 * the number of reserved blocks for GDT.
+	 * The s_group_info array is allocated with this value
+	 * to allow a clean online resize without a complex
+	 * manipulation of pointer.
+	 * The drawback is the unused memory when no resize
+	 * occurs but it's very low in terms of pages
+	 * (see comments below)
+	 * Need to handle this properly when META_BG resizing is allowed
+	 */
+	num_meta_group_infos_max = num_meta_group_infos +
+				le16_to_cpu(es->s_reserved_gdt_blocks);
 
+	/*
+	 * array_size is the size of s_group_info array. We round it
+	 * to the next power of two because this approximation is done
+	 * internally by kmalloc so we can have some more memory
+	 * for free here (e.g. may be used for META_BG resize).
+	 */
+	array_size = 1;
+	while (array_size < sizeof(*sbi->s_group_info) *
+	       num_meta_group_infos_max)
+		array_size = array_size << 1;
 	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
 	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
 	 * So a two level scheme suffices for now. */
-	sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
-				    num_meta_group_infos, GFP_KERNEL);
+	sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
 	if (sbi->s_group_info == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
 		return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		sbi->s_group_info[i] = meta_group_info;
 	}
 
-	/*
-	 * calculate needed size. if change bb_counters size,
-	 * don't forget about ext4_mb_generate_buddy()
-	 */
-	len = sizeof(struct ext4_group_info);
-	len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
 	for (i = 0; i < sbi->s_groups_count; i++) {
-		struct ext4_group_desc *desc;
-
-		meta_group_info =
-			sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
-		j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
-		meta_group_info[j] = kzalloc(len, GFP_KERNEL);
-		if (meta_group_info[j] == NULL) {
-			printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
-			goto err_freebuddy;
-		}
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
 			printk(KERN_ERR
 				"EXT4-fs: can't read descriptor %lu\n", i);
-			i++;
 			goto err_freebuddy;
 		}
-		memset(meta_group_info[j], 0, len);
-		set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
-			&(meta_group_info[j]->bb_state));
-
-		/*
-		 * initialize bb_free to be able to skip
-		 * empty groups without initialization
-		 */
-		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-			meta_group_info[j]->bb_free =
-				ext4_free_blocks_after_init(sb, i, desc);
-		} else {
-			meta_group_info[j]->bb_free =
-				le16_to_cpu(desc->bg_free_blocks_count);
-		}
-
-		INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
-		{
-			struct buffer_head *bh;
-			meta_group_info[j]->bb_bitmap =
-				kmalloc(sb->s_blocksize, GFP_KERNEL);
-			BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
-			bh = read_block_bitmap(sb, i);
-			BUG_ON(bh == NULL);
-			memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
-					sb->s_blocksize);
-			put_bh(bh);
-		}
-#endif
-
+		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+			goto err_freebuddy;
 	}
 
 	return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned i;
 	unsigned offset;
 	unsigned max;
+	int ret;
 
 	if (!test_opt(sb, MBALLOC))
 		return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	} while (i <= sb->s_blocksize_bits + 1);
 
 	/* init file for buddy data */
-	i = ext4_mb_init_backend(sb);
-	if (i) {
+	ret = ext4_mb_init_backend(sb);
+	if (ret != 0) {
 		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
-		return i;
+		return ret;
 	}
 
 	spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 		ext4_lock_group(sb, md->group);
 		for (i = 0; i < md->num; i++) {
 			mb_debug(" %u", md->blocks[i]);
-			err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
-			BUG_ON(err != 0);
+			mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
 		}
 		mb_debug("\n");
 		ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 
 
 
-#define MB_PROC_VALUE_READ(name)				\
-static int ext4_mb_read_##name(char *page, char **start,	\
-		off_t off, int count, int *eof, void *data)	\
+#define MB_PROC_FOPS(name)					\
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)	\
 {								\
-	struct ext4_sb_info *sbi = data;			\
-	int len;						\
-	*eof = 1;						\
-	if (off != 0)						\
-		return 0;					\
-	len = sprintf(page, "%ld\n", sbi->s_mb_##name);		\
-	*start = page;						\
-	return len;						\
-}
-
-#define MB_PROC_VALUE_WRITE(name)				\
-static int ext4_mb_write_##name(struct file *file,		\
-		const char __user *buf, unsigned long cnt, void *data)	\
+	struct ext4_sb_info *sbi = m->private;			\
+								\
+	seq_printf(m, "%ld\n", sbi->s_mb_##name);		\
+	return 0;						\
+}								\
+								\
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
+{								\
+	return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+}								\
+								\
+static ssize_t ext4_mb_##name##_proc_write(struct file *file,	\
+		const char __user *buf, size_t cnt, loff_t *ppos)	\
 {								\
-	struct ext4_sb_info *sbi = data;			\
+	struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
 	char str[32];						\
 	long value;						\
 	if (cnt >= sizeof(str))					\
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file,		\
 		return -ERANGE;					\
 	sbi->s_mb_##name = value;				\
 	return cnt;						\
-}
+}								\
+								\
+static const struct file_operations ext4_mb_##name##_proc_fops = {	\
+	.owner		= THIS_MODULE,				\
+	.open		= ext4_mb_##name##_proc_open,		\
+	.read		= seq_read,				\
+	.llseek		= seq_lseek,				\
+	.release	= single_release,			\
+	.write		= ext4_mb_##name##_proc_write,		\
+};
 
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_FOPS(stats);
+MB_PROC_FOPS(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
+MB_PROC_FOPS(order2_reqs);
+MB_PROC_FOPS(stream_request);
+MB_PROC_FOPS(group_prealloc);
 
 #define	MB_PROC_HANDLER(name, var)					\
 do {									\
-	proc = create_proc_entry(name, mode, sbi->s_mb_proc);		\
+	proc = proc_create_data(name, mode, sbi->s_mb_proc,		\
+				&ext4_mb_##var##_proc_fops, sbi);	\
 	if (proc == NULL) {						\
 		printk(KERN_ERR "EXT4-fs: can't to create %s\n", name);	\
 		goto err_out;						\
 	}								\
-	proc->data = sbi;						\
-	proc->read_proc  = ext4_mb_read_##var ;				\
-	proc->write_proc = ext4_mb_write_##var;				\
 } while (0)
 
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 	struct proc_dir_entry *proc;
 	char devname[64];
 
+	if (proc_root_ext4 == NULL) {
+		sbi->s_mb_proc = NULL;
+		return -EINVAL;
+	}
 	bdevname(sb->s_bdev, devname);
 	sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
 
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 
 
 	err = -EIO;
-	bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
 	if (!bitmap_bh)
 		goto out_err;
 
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+	/*
+	 * free blocks account has already be reduced/reserved
+	 * at write_begin() time for delayed allocation
+	 * do not double accounting
+	 */
+	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+		percpu_counter_sub(&sbi->s_freeblocks_counter,
+					ac->ac_b_ex.fe_len);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi,
+							  ac->ac_b_ex.fe_group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
 
 	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
 	if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 		if (bit >= end)
 			break;
 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-		if (next > end)
-			next = end;
 		start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
 				le32_to_cpu(sbi->s_es->s_first_data_block);
 		mb_debug("    free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	if (list_empty(&grp->bb_prealloc_list))
 		return 0;
 
-	bitmap_bh = read_block_bitmap(sb, group);
+	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
 		/* error handling here */
 		ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		BUG_ON(err != 0); /* error handling here */
 
-		bitmap_bh = read_block_bitmap(sb, group);
+		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
 			/* error handling here */
 			ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	sbi = EXT4_SB(sb);
 
 	if (!test_opt(sb, MBALLOC)) {
-		block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+		block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
 					    &(ar->len), errp);
 		return block;
 	}
+	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+		/*
+		 * With delalloc we already reserved the blocks
+		 */
+		ar->len = ext4_has_free_blocks(sbi, ar->len);
+	}
+
+	if (ar->len == 0) {
+		*errp = -ENOSPC;
+		return 0;
+	}
 
 	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
 		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 	inquota = ar->len;
 
+	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (!ac) {
+		ar->len = 0;
 		*errp = -ENOMEM;
-		return 0;
+		goto out1;
 	}
 
 	ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	*errp = ext4_mb_initialize_context(ac, ar);
 	if (*errp) {
 		ar->len = 0;
-		goto out;
+		goto out2;
 	}
 
 	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
 	if (!ext4_mb_use_preallocated(ac)) {
-
 		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
 		ext4_mb_normalize_request(ac, ar);
 repeat:
@@ -4085,11 +4261,12 @@ repeat:
 
 	ext4_mb_release_context(ac);
 
-out:
+out2:
+	kmem_cache_free(ext4_ac_cachep, ac);
+out1:
 	if (ar->len < inquota)
 		DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
 
-	kmem_cache_free(ext4_ac_cachep, ac);
 	return block;
 }
 static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
 		overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
 		count -= overflow;
 	}
-	bitmap_bh = read_block_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
 	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
 		ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
 	} else {
 		ext4_lock_group(sb, block_group);
-		err = mb_free_blocks(inode, &e4b, bit, count);
+		mb_free_blocks(inode, &e4b, bit, count);
 		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
 		ext4_unlock_group(sb, block_group);
-		BUG_ON(err != 0);
 	}
 
 	spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks += count;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
+
 	ext4_mb_release_desc(&e4b);
 
 	*freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
 /*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+	return (struct ext4_dir_entry_2 *)((char *)p +
+		ext4_rec_len_from_disk(p->rec_len));
+}
+
+/*
  * Future: use high four bits of block for coalesce-on-delete flags
  * Mask them off for now.
  */
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
 		EXT4_DIR_REC_LEN(2) - infosize;
-	return 0? 20: entry_space / sizeof(struct dx_entry);
+	return entry_space / sizeof(struct dx_entry);
 }
 
 static inline unsigned dx_node_limit (struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
-	return 0? 22: entry_space / sizeof(struct dx_entry);
+	return entry_space / sizeof(struct dx_entry);
 }
 
 /*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 
 
 /*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
-	return (struct ext4_dir_entry_2 *)((char *)p +
-		ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
  * This function fills a red-black tree with information from a
  * directory block.  It returns the number directory entries loaded
  * into the tree.  If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 		de = (struct ext4_dir_entry_2 *) bh->b_data;
 		top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
 				       EXT4_DIR_REC_LEN(0));
-		for (; de < top; de = ext4_next_entry(de))
-		if (ext4_match (namelen, name, de)) {
-			if (!ext4_check_dir_entry("ext4_find_entry",
-						  dir, de, bh,
-				  (block<<EXT4_BLOCK_SIZE_BITS(sb))
-					  +((char *)de - bh->b_data))) {
-				brelse (bh);
+		for (; de < top; de = ext4_next_entry(de)) {
+			int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+				  + ((char *) de - bh->b_data);
+
+			if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+				brelse(bh);
 				*err = ERR_BAD_DX_DIR;
 				goto errout;
 			}
-			*res_dir = de;
-			dx_release (frames);
-			return bh;
+
+			if (ext4_match(namelen, name, de)) {
+				*res_dir = de;
+				dx_release(frames);
+				return bh;
+			}
 		}
 		brelse (bh);
 		/* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
 
 	/*
+	 * We can allocate memory for mb_alloc based on the new group
+	 * descriptor
+	 */
+	if (test_opt(sb, MBALLOC)) {
+		err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+		if (err)
+			goto exit_journal;
+	}
+	/*
 	 * Make the new blocks and inodes valid next.  We do this before
 	 * increasing the group count so that once the group is enabled,
 	 * all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	handle_t *handle;
 	int err;
 	unsigned long freed_blocks;
+	ext4_group_t group;
+	struct ext4_group_info *grp;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	}
 
 	/* Handle the remaining blocks in the last group only. */
-	ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
 
 	if (last == 0) {
 		ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		   o_blocks_count + add);
 	if ((err = ext4_journal_stop(handle)))
 		goto exit_put;
+
+	/*
+	 * Mark mballoc pages as not up to date so that they will be updated
+	 * next time they are loaded by ext4_mb_load_buddy.
+	 */
+	if (test_opt(sb, MBALLOC)) {
+		struct ext4_sb_info *sbi = EXT4_SB(sb);
+		struct inode *inode = sbi->s_buddy_cache;
+		int blocks_per_page;
+		int block;
+		int pnum;
+		struct page *page;
+
+		/* Set buddy page as not up to date */
+		blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+		block = group * 2;
+		pnum = block / blocks_per_page;
+		page = find_get_page(inode->i_mapping, pnum);
+		if (page != NULL) {
+			ClearPageUptodate(page);
+			page_cache_release(page);
+		}
+
+		/* Set bitmap page as not up to date */
+		block++;
+		pnum = block / blocks_per_page;
+		page = find_get_page(inode->i_mapping, pnum);
+		if (page != NULL) {
+			ClearPageUptodate(page);
+			page_cache_release(page);
+		}
+
+		/* Get the info on the last group */
+		grp = ext4_get_group_info(sb, group);
+
+		/* Update free blocks in group info */
+		ext4_mb_update_group_info(grp, add);
+	}
+
 	if (test_opt(sb, DEBUG))
 		printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
 		       ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 02bf24343979..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
 	jbd2_journal_destroy(sbi->s_journal);
+	sbi->s_journal = NULL;
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
+	kfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
+	jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+	ei->i_reserved_data_blocks = 0;
+	ei->i_reserved_meta_blocks = 0;
+	ei->i_allocated_meta_blocks = 0;
+	ei->i_delalloc_reserved_flag = 0;
+	spin_lock_init(&(ei->i_block_reservation_lock));
 	return &ei->vfs_inode;
 }
 
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
 	EXT4_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
 		kfree(rsv);
+	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+				       &EXT4_I(inode)->jinode);
 }
 
 static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	unsigned long def_mount_opts;
 	struct super_block *sb = vfs->mnt_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	journal_t *journal = sbi->s_journal;
 	struct ext4_super_block *es = sbi->s_es;
 
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nomballoc");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
+	if (!test_opt(sb, DELALLOC))
+		seq_puts(seq, ",nodelalloc");
+
 
 	if (sbi->s_stripe)
 		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-	Opt_mballoc, Opt_nomballoc, Opt_stripe,
+	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 };
 
 static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
 	{Opt_nomballoc, "nomballoc"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
+	{Opt_delalloc, "delalloc"},
+	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_err, NULL},
 };
 
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
 	int qtype, qfmt;
 	char *qname;
 #endif
+	ext4_fsblk_t last_block;
 
 	if (!options)
 		return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
 			clear_opt(sbi->s_mount_opt, NOBH);
 			break;
 		case Opt_extents:
+			if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+					EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+				ext4_warning(sb, __func__,
+					"extents feature not enabled "
+					"on this filesystem, use tune2fs\n");
+				return 0;
+			}
 			set_opt (sbi->s_mount_opt, EXTENTS);
 			break;
 		case Opt_noextents:
+			/*
+			 * When e2fsprogs support resizing an already existing
+			 * ext3 file system to greater than 2**32 we need to
+			 * add support to block allocator to handle growing
+			 * already existing block  mapped inode so that blocks
+			 * allocated for them fall within 2**32
+			 */
+			last_block = ext4_blocks_count(sbi->s_es) - 1;
+			if (last_block  > 0xffffffffULL) {
+				printk(KERN_ERR "EXT4-fs: Filesystem too "
+						"large to mount with "
+						"-o noextents options\n");
+				return 0;
+			}
 			clear_opt (sbi->s_mount_opt, EXTENTS);
 			break;
 		case Opt_i_version:
 			set_opt(sbi->s_mount_opt, I_VERSION);
 			sb->s_flags |= MS_I_VERSION;
 			break;
+		case Opt_nodelalloc:
+			clear_opt(sbi->s_mount_opt, DELALLOC);
+			break;
 		case Opt_mballoc:
 			set_opt(sbi->s_mount_opt, MBALLOC);
 			break;
@@ -1331,6 +1370,9 @@ set_qf_format:
 				return 0;
 			sbi->s_stripe = option;
 			break;
+		case Opt_delalloc:
+			set_opt(sbi->s_mount_opt, DELALLOC);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	return res;
 }
 
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = NULL;
+	struct buffer_head *bh;
+	ext4_group_t flex_group_count;
+	ext4_group_t flex_group;
+	int groups_per_flex = 0;
+	__u64 block_bitmap = 0;
+	int i;
+
+	if (!sbi->s_es->s_log_groups_per_flex) {
+		sbi->s_log_groups_per_flex = 0;
+		return 1;
+	}
+
+	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+	flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+		groups_per_flex;
+	sbi->s_flex_groups = kmalloc(flex_group_count *
+				     sizeof(struct flex_groups), GFP_KERNEL);
+	if (sbi->s_flex_groups == NULL) {
+		printk(KERN_ERR "EXT4-fs: not enough memory\n");
+		goto failed;
+	}
+	memset(sbi->s_flex_groups, 0, flex_group_count *
+	       sizeof(struct flex_groups));
+
+	gdp = ext4_get_group_desc(sb, 1, &bh);
+	block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		gdp = ext4_get_group_desc(sb, i, &bh);
+
+		flex_group = ext4_flex_group(sbi, i);
+		sbi->s_flex_groups[flex_group].free_inodes +=
+			le16_to_cpu(gdp->bg_free_inodes_count);
+		sbi->s_flex_groups[flex_group].free_blocks +=
+			le16_to_cpu(gdp->bg_free_blocks_count);
+	}
+
+	return 1;
+failed:
+	return 0;
+}
+
 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 			    struct ext4_group_desc *gdp)
 {
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 }
 
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
-				__releases(kernel_sem)
-				__acquires(kernel_sem)
+				__releases(kernel_lock)
+				__acquires(kernel_lock)
 
 {
 	struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		goto out_fail;
 	}
 
-	if (!sb_set_blocksize(sb, blocksize)) {
-		printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
-		goto out_fail;
-	}
-
 	/*
 	 * The ext4 superblock will not be buffer aligned for other than 1kB
 	 * block sizes.  We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 
 	/*
 	 * turn on extents feature by default in ext4 filesystem
-	 * User -o noextents to turn it off
+	 * only if feature flag already set by mkfs or tune2fs.
+	 * Use -o noextents to turn it off
 	 */
-	set_opt(sbi->s_mount_opt, EXTENTS);
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+		set_opt(sbi->s_mount_opt, EXTENTS);
+	else
+		ext4_warning(sb, __func__,
+			"extents feature not enabled on this filesystem, "
+			"use tune2fs.\n");
 	/*
-	 * turn on mballoc feature by default in ext4 filesystem
-	 * User -o nomballoc to turn it off
+	 * turn on mballoc code by default in ext4 filesystem
+	 * Use -o nomballoc to turn it off
 	 */
 	set_opt(sbi->s_mount_opt, MBALLOC);
 
+	/*
+	 * enable delayed allocation by default
+	 * Use -o nodelalloc to turn it off
+	 */
+	set_opt(sbi->s_mount_opt, DELALLOC);
+
+
 	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
 			    NULL, 0))
 		goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
 		goto failed_mount2;
 	}
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+		if (!ext4_fill_flex_info(sb)) {
+			printk(KERN_ERR
+			       "EXT4-fs: unable to initialize "
+			       "flex_bg meta info!\n");
+			goto failed_mount2;
+		}
+
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+				"requested data journaling mode\n");
+		clear_opt(sbi->s_mount_opt, DELALLOC);
+	} else if (test_opt(sb, DELALLOC))
+		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
 	ext4_ext_init(sb);
 	ext4_mb_init(sb, needs_recovery);
 
@@ -2372,6 +2485,7 @@ cantfind_ext4:
 
 failed_mount4:
 	jbd2_journal_destroy(sbi->s_journal);
+	sbi->s_journal = NULL;
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 			err = ext4_journal_dirty_metadata(handle, bh);
 		else {
 			/* Always do at least ordered writes for quotas */
-			err = ext4_journal_dirty_data(handle, bh);
+			err = ext4_jbd2_file_inode(handle, inode);
 			mark_buffer_dirty(bh);
 		}
 		brelse(bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
 			/* We need to allocate a new block */
 			ext4_fsblk_t goal = ext4_group_first_block_no(sb,
 						EXT4_I(inode)->i_block_group);
-			ext4_fsblk_t block = ext4_new_block(handle, inode,
+			ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
 							goal, &error);
 			if (error)
 				goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
 #include "ext4.h"
 #include "xattr.h"
 
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static size_t
 ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 			const char *name, size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
 #include "ext4.h"
 #include "xattr.h"
 
-#define XATTR_USER_PREFIX "user."
-
 static size_t
 ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 		     const char *name, size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 
 	J_ASSERT(transaction->t_state == T_FINISHED);
 	J_ASSERT(transaction->t_buffers == NULL);
-	J_ASSERT(transaction->t_sync_datalist == NULL);
 	J_ASSERT(transaction->t_forget == NULL);
 	J_ASSERT(transaction->t_iobuf_list == NULL);
 	J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
  * After the transaction commits, these pages are left on the LRU, with no
  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
 }
 
 /*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-	if (!jbd_trylock_bh_state(bh)) {
-		spin_unlock(&journal->j_list_lock);
-		schedule();
-		return 0;
-	}
-	return 1;
-}
-
-/*
  * Done it all: now submit the commit record.  We should have
  * cleaned up our previous buffers by now, so if we are in abort
  * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
 	struct buffer_head *bh;
 	int ret;
 	int barrier_done = 0;
+	struct timespec now = current_kernel_time();
 
 	if (is_journal_aborted(journal))
 		return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 
 	if (JBD2_HAS_COMPAT_FEATURE(journal,
 				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 }
 
 /*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
  */
-static int journal_wait_on_locked_list(journal_t *journal,
-				       transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
 {
-	int ret = 0;
-	struct journal_head *jh;
-
-	while (commit_transaction->t_locked_list) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_locked_list->b_tprev;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				ret = -EIO;
-			spin_lock(&journal->j_list_lock);
-		}
-		if (!inverted_lock(journal, bh)) {
-			put_bh(bh);
-			spin_lock(&journal->j_list_lock);
-			continue;
-		}
-		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-			__jbd2_journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			jbd2_journal_remove_journal_head(bh);
-			put_bh(bh);
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-		put_bh(bh);
-		cond_resched_lock(&journal->j_list_lock);
-	}
+	int ret;
+	struct writeback_control wbc = {
+		.sync_mode =  WB_SYNC_ALL,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = 0,
+		.range_end = i_size_read(mapping->host),
+		.for_writepages = 1,
+	};
+
+	ret = generic_writepages(mapping, &wbc);
 	return ret;
-  }
+}
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
 {
-	int i;
+	struct jbd2_inode *jinode;
+	int err, ret = 0;
+	struct address_space *mapping;
 
-	for (i = 0; i < bufs; i++) {
-		wbuf[i]->b_end_io = end_buffer_write_sync;
-		/* We use-up our safety reference in submit_bh() */
-		submit_bh(WRITE, wbuf[i]);
+	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		mapping = jinode->i_vfs_inode->i_mapping;
+		jinode->i_flags |= JI_COMMIT_RUNNING;
+		spin_unlock(&journal->j_list_lock);
+		/*
+		 * submit the inode data buffers. We use writepage
+		 * instead of writepages. Because writepages can do
+		 * block allocation  with delalloc. We need to write
+		 * only allocated blocks here.
+		 */
+		err = journal_submit_inode_data_buffers(mapping);
+		if (!ret)
+			ret = err;
+		spin_lock(&journal->j_list_lock);
+		J_ASSERT(jinode->i_transaction == commit_transaction);
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
+	spin_unlock(&journal->j_list_lock);
+	return ret;
 }
 
 /*
- *  Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
  */
-static void journal_submit_data_buffers(journal_t *journal,
-				transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
 {
-	struct journal_head *jh;
-	struct buffer_head *bh;
-	int locked;
-	int bufs = 0;
-	struct buffer_head **wbuf = journal->j_wbuf;
+	struct jbd2_inode *jinode, *next_i;
+	int err, ret = 0;
 
-	/*
-	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_sync_datalist, so we have to keep looping back to
-	 * write_out_data until we *know* that the list is empty.
-	 *
-	 * Cleanup any flushed data buffers from the data list.  Even in
-	 * abort mode, we want to flush this out as soon as possible.
-	 */
-write_out_data:
-	cond_resched();
+	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		jinode->i_flags |= JI_COMMIT_RUNNING;
+		spin_unlock(&journal->j_list_lock);
+		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+		if (!ret)
+			ret = err;
+		spin_lock(&journal->j_list_lock);
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+	}
 
-	while (commit_transaction->t_sync_datalist) {
-		jh = commit_transaction->t_sync_datalist;
-		bh = jh2bh(jh);
-		locked = 0;
-
-		/* Get reference just to make sure buffer does not disappear
-		 * when we are forced to drop various locks */
-		get_bh(bh);
-		/* If the buffer is dirty, we need to submit IO and hence
-		 * we need the buffer lock. We try to lock the buffer without
-		 * blocking. If we fail, we need to drop j_list_lock and do
-		 * blocking lock_buffer().
-		 */
-		if (buffer_dirty(bh)) {
-			if (test_set_buffer_locked(bh)) {
-				BUFFER_TRACE(bh, "needs blocking lock");
-				spin_unlock(&journal->j_list_lock);
-				/* Write out all data to prevent deadlocks */
-				journal_do_submit_data(wbuf, bufs);
-				bufs = 0;
-				lock_buffer(bh);
-				spin_lock(&journal->j_list_lock);
-			}
-			locked = 1;
-		}
-		/* We have to get bh_state lock. Again out of order, sigh. */
-		if (!inverted_lock(journal, bh)) {
-			jbd_lock_bh_state(bh);
-			spin_lock(&journal->j_list_lock);
-		}
-		/* Someone already cleaned up the buffer? */
-		if (!buffer_jbd(bh)
-			|| jh->b_transaction != commit_transaction
-			|| jh->b_jlist != BJ_SyncData) {
-			jbd_unlock_bh_state(bh);
-			if (locked)
-				unlock_buffer(bh);
-			BUFFER_TRACE(bh, "already cleaned up");
-			put_bh(bh);
-			continue;
-		}
-		if (locked && test_clear_buffer_dirty(bh)) {
-			BUFFER_TRACE(bh, "needs writeout, adding to array");
-			wbuf[bufs++] = bh;
-			__jbd2_journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			if (bufs == journal->j_wbufsize) {
-				spin_unlock(&journal->j_list_lock);
-				journal_do_submit_data(wbuf, bufs);
-				bufs = 0;
-				goto write_out_data;
-			}
-		} else if (!locked && buffer_locked(bh)) {
-			__jbd2_journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			put_bh(bh);
+	/* Now refile inode to proper lists */
+	list_for_each_entry_safe(jinode, next_i,
+				 &commit_transaction->t_inode_list, i_list) {
+		list_del(&jinode->i_list);
+		if (jinode->i_next_transaction) {
+			jinode->i_transaction = jinode->i_next_transaction;
+			jinode->i_next_transaction = NULL;
+			list_add(&jinode->i_list,
+				&jinode->i_transaction->t_inode_list);
 		} else {
-			BUFFER_TRACE(bh, "writeout complete: unfile");
-			__jbd2_journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			if (locked)
-				unlock_buffer(bh);
-			jbd2_journal_remove_journal_head(bh);
-			/* Once for our safety reference, once for
-			 * jbd2_journal_remove_journal_head() */
-			put_bh(bh);
-			put_bh(bh);
-		}
-
-		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-			spin_unlock(&journal->j_list_lock);
-			goto write_out_data;
+			jinode->i_transaction = NULL;
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
-	journal_do_submit_data(wbuf, bufs);
+
+	return ret;
 }
 
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-	err = 0;
-	journal_submit_data_buffers(journal, commit_transaction);
-
-	/*
-	 * Wait for all previously submitted IO to complete if commit
-	 * record is to be written synchronously.
-	 */
-	spin_lock(&journal->j_list_lock);
-	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-		err = journal_wait_on_locked_list(journal,
-						commit_transaction);
-
-	spin_unlock(&journal->j_list_lock);
-
+	err = journal_submit_data_buffers(journal, commit_transaction);
 	if (err)
 		jbd2_journal_abort(journal, err);
 
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	jbd_debug(3, "JBD: commit phase 2\n");
 
 	/*
-	 * If we found any dirty or locked buffers, then we should have
-	 * looped back up to the write_out_data label.  If there weren't
-	 * any then journal_clean_data_list should have wiped the list
-	 * clean by now, so check that it is in fact empty.
-	 */
-	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
-	jbd_debug (3, "JBD: commit phase 3\n");
-
-	/*
 	 * Way to go: we have now written out all of the data for a
 	 * transaction!  Now comes the tricky part: we need to write out
 	 * metadata.  Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	J_ASSERT(commit_transaction->t_nr_buffers <=
 		 commit_transaction->t_outstanding_credits);
 
+	err = 0;
 	descriptor = NULL;
 	bufs = 0;
 	while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
-
-		spin_lock(&journal->j_list_lock);
-		err = journal_wait_on_locked_list(journal,
-						commit_transaction);
-		spin_unlock(&journal->j_list_lock);
-		if (err)
-			__jbd2_journal_abort_hard(journal);
 	}
 
+	/*
+	 * This is the right place to wait for data buffers both for ASYNC
+	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+	 * the commit block went to disk (which happens above). If commit is
+	 * SYNC, we need to wait for data buffers before we start writing
+	 * commit block, which happens below in such setting.
+	 */
+	err = journal_finish_inode_data_buffers(journal, commit_transaction);
+	if (err)
+		jbd2_journal_abort(journal, err);
+
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
 	   so we incur less scheduling load.
 	*/
 
-	jbd_debug(3, "JBD: commit phase 4\n");
+	jbd_debug(3, "JBD: commit phase 3\n");
 
 	/*
 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
 
 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-	jbd_debug(3, "JBD: commit phase 5\n");
+	jbd_debug(3, "JBD: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
  wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
-	jbd_debug(3, "JBD: commit phase 6\n");
+	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
            transaction can be removed from any checkpoint list it was on
            before. */
 
-	jbd_debug(3, "JBD: commit phase 7\n");
+	jbd_debug(3, "JBD: commit phase 6\n");
 
-	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
 
 	/* Done with this transaction! */
 
-	jbd_debug(3, "JBD: commit phase 8\n");
+	jbd_debug(3, "JBD: commit phase 7\n");
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 }
 
 /*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+	jinode->i_transaction = NULL;
+	jinode->i_next_transaction = NULL;
+	jinode->i_vfs_inode = inode;
+	jinode->i_flags = 0;
+	INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+				    struct jbd2_inode *jinode)
+{
+	int writeout = 0;
+
+	if (!journal)
+		return;
+restart:
+	spin_lock(&journal->j_list_lock);
+	/* Is commit writing out inode - we have to wait */
+	if (jinode->i_flags & JI_COMMIT_RUNNING) {
+		wait_queue_head_t *wq;
+		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		finish_wait(wq, &wait.wait);
+		goto restart;
+	}
+
+	/* Do we need to wait for data writeback? */
+	if (journal->j_committing_transaction == jinode->i_transaction)
+		writeout = 1;
+	if (jinode->i_transaction) {
+		list_del(&jinode->i_list);
+		jinode->i_transaction = NULL;
+	}
+	spin_unlock(&journal->j_list_lock);
+}
+
+/*
  * debugfs tunables
  */
 #ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
  *	new transaction	and we can't block without protecting against other
  *	processes trying to touch the journal while it is in transition.
  *
- * Called under j_state_lock
  */
 
 static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
+	INIT_LIST_HEAD(&transaction->t_inode_list);
 
 	/* Set up the commit timer for the new transaction. */
 	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
 }
 
 /**
- * int jbd2_journal_dirty_data() -  mark a buffer as containing dirty data which
- *                             needs to be flushed before we can commit the
- *                             current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-	journal_t *journal = handle->h_transaction->t_journal;
-	int need_brelse = 0;
-	struct journal_head *jh;
-
-	if (is_handle_aborted(handle))
-		return 0;
-
-	jh = jbd2_journal_add_journal_head(bh);
-	JBUFFER_TRACE(jh, "entry");
-
-	/*
-	 * The buffer could *already* be dirty.  Writeout can start
-	 * at any time.
-	 */
-	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
-	/*
-	 * What if the buffer is already part of a running transaction?
-	 *
-	 * There are two cases:
-	 * 1) It is part of the current running transaction.  Refile it,
-	 *    just in case we have allocated it as metadata, deallocated
-	 *    it, then reallocated it as data.
-	 * 2) It is part of the previous, still-committing transaction.
-	 *    If all we want to do is to guarantee that the buffer will be
-	 *    written to disk before this new transaction commits, then
-	 *    being sure that the *previous* transaction has this same
-	 *    property is sufficient for us!  Just leave it on its old
-	 *    transaction.
-	 *
-	 * In case (2), the buffer must not already exist as metadata
-	 * --- that would violate write ordering (a transaction is free
-	 * to write its data at any point, even before the previous
-	 * committing transaction has committed).  The caller must
-	 * never, ever allow this to happen: there's nothing we can do
-	 * about it in this layer.
-	 */
-	jbd_lock_bh_state(bh);
-	spin_lock(&journal->j_list_lock);
-
-	/* Now that we have bh_state locked, are we really still mapped? */
-	if (!buffer_mapped(bh)) {
-		JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
-		goto no_journal;
-	}
-
-	if (jh->b_transaction) {
-		JBUFFER_TRACE(jh, "has transaction");
-		if (jh->b_transaction != handle->h_transaction) {
-			JBUFFER_TRACE(jh, "belongs to older transaction");
-			J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
-
-			/* @@@ IS THIS TRUE  ? */
-			/*
-			 * Not any more.  Scenario: someone does a write()
-			 * in data=journal mode.  The buffer's transaction has
-			 * moved into commit.  Then someone does another
-			 * write() to the file.  We do the frozen data copyout
-			 * and set b_next_transaction to point to j_running_t.
-			 * And while we're in that state, someone does a
-			 * writepage() in an attempt to pageout the same area
-			 * of the file via a shared mapping.  At present that
-			 * calls jbd2_journal_dirty_data(), and we get right here.
-			 * It may be too late to journal the data.  Simply
-			 * falling through to the next test will suffice: the
-			 * data will be dirty and wil be checkpointed.  The
-			 * ordering comments in the next comment block still
-			 * apply.
-			 */
-			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
-			/*
-			 * If we're journalling data, and this buffer was
-			 * subject to a write(), it could be metadata, forget
-			 * or shadow against the committing transaction.  Now,
-			 * someone has dirtied the same darn page via a mapping
-			 * and it is being writepage()'d.
-			 * We *could* just steal the page from commit, with some
-			 * fancy locking there.  Instead, we just skip it -
-			 * don't tie the page's buffers to the new transaction
-			 * at all.
-			 * Implication: if we crash before the writepage() data
-			 * is written into the filesystem, recovery will replay
-			 * the write() data.
-			 */
-			if (jh->b_jlist != BJ_None &&
-					jh->b_jlist != BJ_SyncData &&
-					jh->b_jlist != BJ_Locked) {
-				JBUFFER_TRACE(jh, "Not stealing");
-				goto no_journal;
-			}
-
-			/*
-			 * This buffer may be undergoing writeout in commit.  We
-			 * can't return from here and let the caller dirty it
-			 * again because that can cause the write-out loop in
-			 * commit to never terminate.
-			 */
-			if (buffer_dirty(bh)) {
-				get_bh(bh);
-				spin_unlock(&journal->j_list_lock);
-				jbd_unlock_bh_state(bh);
-				need_brelse = 1;
-				sync_dirty_buffer(bh);
-				jbd_lock_bh_state(bh);
-				spin_lock(&journal->j_list_lock);
-				/* Since we dropped the lock... */
-				if (!buffer_mapped(bh)) {
-					JBUFFER_TRACE(jh, "buffer got unmapped");
-					goto no_journal;
-				}
-				/* The buffer may become locked again at any
-				   time if it is redirtied */
-			}
-
-			/* journal_clean_data_list() may have got there first */
-			if (jh->b_transaction != NULL) {
-				JBUFFER_TRACE(jh, "unfile from commit");
-				__jbd2_journal_temp_unlink_buffer(jh);
-				/* It still points to the committing
-				 * transaction; move it to this one so
-				 * that the refile assert checks are
-				 * happy. */
-				jh->b_transaction = handle->h_transaction;
-			}
-			/* The buffer will be refiled below */
-
-		}
-		/*
-		 * Special case --- the buffer might actually have been
-		 * allocated and then immediately deallocated in the previous,
-		 * committing transaction, so might still be left on that
-		 * transaction's metadata lists.
-		 */
-		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
-			JBUFFER_TRACE(jh, "not on correct data list: unfile");
-			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-			__jbd2_journal_temp_unlink_buffer(jh);
-			jh->b_transaction = handle->h_transaction;
-			JBUFFER_TRACE(jh, "file as data");
-			__jbd2_journal_file_buffer(jh, handle->h_transaction,
-						BJ_SyncData);
-		}
-	} else {
-		JBUFFER_TRACE(jh, "not on a transaction");
-		__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
-	}
-no_journal:
-	spin_unlock(&journal->j_list_lock);
-	jbd_unlock_bh_state(bh);
-	if (need_brelse) {
-		BUFFER_TRACE(bh, "brelse");
-		__brelse(bh);
-	}
-	JBUFFER_TRACE(jh, "exit");
-	jbd2_journal_put_journal_head(jh);
-	return 0;
-}
-
-/**
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
  * @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
  * Remove a buffer from the appropriate transaction list.
  *
  * Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
+ * of these pointers, it could go bad.  Generally the caller needs to re-read
+ * the pointer from the transaction_t.
  *
  * Called under j_list_lock.  The journal may not be locked.
  */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 	switch (jh->b_jlist) {
 	case BJ_None:
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers--;
 		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 		goto out;
 
 	spin_lock(&journal->j_list_lock);
-	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
-		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
-			/* A written-back ordered data buffer */
-			JBUFFER_TRACE(jh, "release data");
-			__jbd2_journal_unfile_buffer(jh);
-			jbd2_journal_remove_journal_head(bh);
-			__brelse(bh);
-		}
-	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+	if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
 		/* written-back checkpointed metadata buffer */
 		if (jh->b_jlist == BJ_None) {
 			JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
 	return;
 }
 
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The later might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+	transaction_t *transaction;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	jbd2_log_wait_commit(journal, tid);
+}
 
 /**
  * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
  * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
  *
  *
  * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
  * journal_try_to_free_buffer() is changing its state.  But that
  * cannot happen because we never reallocate freed data as metadata
  * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
  */
 int jbd2_journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, gfp_t unused_gfp_mask)
+				struct page *page, gfp_t gfp_mask)
 {
 	struct buffer_head *head;
 	struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
 		/*
 		 * We take our own ref against the journal_head here to avoid
 		 * having to add tons of locking around each instance of
-		 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+		 * jbd2_journal_remove_journal_head() and
+		 * jbd2_journal_put_journal_head().
 		 */
 		jh = jbd2_journal_grab_journal_head(bh);
 		if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
 		if (buffer_jbd(bh))
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
+
 	ret = try_to_free_buffers(page);
+
+	/*
+	 * There are a number of places where jbd2_journal_try_to_free_buffers()
+	 * could race with jbd2_journal_commit_transaction(), the later still
+	 * holds the reference to the buffers to free while processing them.
+	 * try_to_free_buffers() failed to free those buffers. Some of the
+	 * caller of releasepage() request page buffers to be dropped, otherwise
+	 * treat the fail-to-free as errors (such as generic_file_direct_IO())
+	 *
+	 * So, if the caller of try_to_release_page() wants the synchronous
+	 * behaviour(i.e make sure buffers are dropped upon return),
+	 * let's wait for the current transaction to finish flush of
+	 * dirty data buffers, then try to free those buffers again,
+	 * with the journal locked.
+	 */
+	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+		jbd2_journal_wait_for_transaction_sync_data(journal);
+		ret = try_to_free_buffers(page);
+	}
+
 busy:
 	return ret;
 }
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!buffer_jbd(bh))
 		goto zap_buffer_unlocked;
 
+	/* OK, we have data buffer in journaled mode */
 	spin_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		}
 	} else if (transaction == journal->j_committing_transaction) {
 		JBUFFER_TRACE(jh, "on committing transaction");
-		if (jh->b_jlist == BJ_Locked) {
-			/*
-			 * The buffer is on the committing transaction's locked
-			 * list.  We have the buffer locked, so I/O has
-			 * completed.  So we can nail the buffer now.
-			 */
-			may_free = __dispose_buffer(jh, transaction);
-			goto zap_buffer;
-		}
 		/*
 		 * If it is committing, we simply cannot touch it.  We
 		 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 		J_ASSERT_JH(jh, !jh->b_committed_data);
 		J_ASSERT_JH(jh, !jh->b_frozen_data);
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers++;
 		list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list =  &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 	spin_unlock(&journal->j_list_lock);
 	__brelse(bh);
 }
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+
+	if (is_handle_aborted(handle))
+		return -EIO;
+
+	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+			transaction->t_tid);
+
+	/*
+	 * First check whether inode isn't already on the transaction's
+	 * lists without taking the lock. Note that this check is safe
+	 * without the lock as we cannot race with somebody removing inode
+	 * from the transaction. The reason is that we remove inode from the
+	 * transaction only in journal_release_jbd_inode() and when we commit
+	 * the transaction. We are guarded from the first case by holding
+	 * a reference to the inode. We are safe against the second case
+	 * because if jinode->i_transaction == transaction, commit code
+	 * cannot touch the transaction because we hold reference to it,
+	 * and if jinode->i_next_transaction == transaction, commit code
+	 * will only file the inode where we want it.
+	 */
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		return 0;
+
+	spin_lock(&journal->j_list_lock);
+
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		goto done;
+
+	/* On some different transaction's list - should be
+	 * the committing one */
+	if (jinode->i_transaction) {
+		J_ASSERT(jinode->i_next_transaction == NULL);
+		J_ASSERT(jinode->i_transaction ==
+					journal->j_committing_transaction);
+		jinode->i_next_transaction = transaction;
+		goto done;
+	}
+	/* Not on any transaction list... */
+	J_ASSERT(!jinode->i_next_transaction);
+	jinode->i_transaction = transaction;
+	list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+	spin_unlock(&journal->j_list_lock);
+
+	return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+					loff_t new_size)
+{
+	journal_t *journal;
+	transaction_t *commit_trans;
+	int ret = 0;
+
+	if (!inode->i_transaction && !inode->i_next_transaction)
+		goto out;
+	journal = inode->i_transaction->t_journal;
+	spin_lock(&journal->j_state_lock);
+	commit_trans = journal->j_committing_transaction;
+	spin_unlock(&journal->j_state_lock);
+	if (inode->i_transaction == commit_trans) {
+		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+			new_size, LLONG_MAX);
+		if (ret)
+			jbd2_journal_abort(journal, ret);
+	}
+out:
+	return ret;
+}
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
 	bio->bi_end_io = mpage_end_io_read;
 	if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 	submit_bio(rw, bio);
 	return NULL;
 }
+EXPORT_SYMBOL(mpage_bio_submit);
 
 static struct bio *
 mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
  * written, so it can intelligently allocate a suitably-sized BIO.  For now,
  * just allocate full-size (16-page) BIOs.
  */
-struct mpage_data {
-	struct bio *bio;
-	sector_t last_block_in_bio;
-	get_block_t *get_block;
-	unsigned use_writepage;
-};
 
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
-			     void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+		      void *data)
 {
 	struct mpage_data *mpd = data;
 	struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
 	mpd->bio = bio;
 	return ret;
 }
+EXPORT_SYMBOL(__mpage_writepage);
 
 /**
  * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 0764b662b339..1c1b13e29223 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1089,6 +1089,7 @@ extern int drm_mm_remove_space_from_tail(struct drm_mm *mm, unsigned long size);
 extern int drm_mm_add_space_to_tail(struct drm_mm *mm, unsigned long size);
 
 extern void drm_core_ioremap(struct drm_map *map, struct drm_device *dev);
+extern void drm_core_ioremap_wc(struct drm_map *map, struct drm_device *dev);
 extern void drm_core_ioremapfree(struct drm_map *map, struct drm_device *dev);
 
 static __inline__ struct drm_map *drm_core_findmap(struct drm_device *dev,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index faac13e2cc5c..52e510a0aec2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
 				pgoff_t start, pgoff_t end);
 extern int __filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end, int sync_mode);
+extern int filemap_fdatawrite_range(struct address_space *mapping,
+				loff_t start, loff_t end);
 
 extern long do_fsync(struct file *file, int datasync);
 extern void sync_supers(void);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d147f0f90360..3dd209007098 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -168,6 +168,8 @@ struct commit_header {
 	unsigned char   h_chksum_size;
 	unsigned char 	h_padding[2];
 	__be32 		h_chksum[JBD2_CHECKSUM_BYTES];
+	__be64		h_commit_sec;
+	__be32		h_commit_nsec;
 };
 
 /*
@@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 	bit_spin_unlock(BH_JournalHead, &bh->b_state);
 }
 
+/* Flags in jbd_inode->i_flags */
+#define __JI_COMMIT_RUNNING 0
+/* Commit of the inode data in progress. We use this flag to protect us from
+ * concurrent deletion of inode. We cannot use reference to inode for this
+ * since we cannot afford doing last iput() on behalf of kjournald
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd_inode is the structure linking inodes in ordered mode
+ *   present in a transaction so that we can sync them during commit.
+ */
+struct jbd2_inode {
+	/* Which transaction does this inode belong to? Either the running
+	 * transaction or the committing one. [j_list_lock] */
+	transaction_t *i_transaction;
+
+	/* Pointer to the running transaction modifying inode's data in case
+	 * there is already a committing transaction touching it. [j_list_lock] */
+	transaction_t *i_next_transaction;
+
+	/* List of inodes in the i_transaction [j_list_lock] */
+	struct list_head i_list;
+
+	/* VFS inode this inode belongs to [constant during the lifetime
+	 * of the structure] */
+	struct inode *i_vfs_inode;
+
+	/* Flags of inode [j_list_lock] */
+	unsigned int i_flags;
+};
+
 struct jbd2_revoke_table_s;
 
 /**
@@ -509,24 +543,12 @@ struct transaction_s
 	struct journal_head	*t_reserved_list;
 
 	/*
-	 * Doubly-linked circular list of all buffers under writeout during
-	 * commit [j_list_lock]
-	 */
-	struct journal_head	*t_locked_list;
-
-	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
 	 */
 	struct journal_head	*t_buffers;
 
 	/*
-	 * Doubly-linked circular list of all data buffers still to be
-	 * flushed before this transaction can be committed [j_list_lock]
-	 */
-	struct journal_head	*t_sync_datalist;
-
-	/*
 	 * Doubly-linked circular list of all forget buffers (superseded
 	 * buffers which we can un-checkpoint once this transaction commits)
 	 * [j_list_lock]
@@ -565,6 +587,12 @@ struct transaction_s
 	struct journal_head	*t_log_list;
 
 	/*
+	 * List of inodes whose data we've modified in data=ordered mode.
+	 * [j_list_lock]
+	 */
+	struct list_head	t_inode_list;
+
+	/*
 	 * Protects info related to handles
 	 */
 	spinlock_t		t_handle_lock;
@@ -1004,7 +1032,6 @@ extern int	 jbd2_journal_extend (handle_t *, int nblocks);
 extern int	 jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
-extern int	 jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern void	 jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_forget (handle_t *, struct buffer_head *);
@@ -1044,6 +1071,10 @@ extern void	   jbd2_journal_ack_err    (journal_t *);
 extern int	   jbd2_journal_clear_err  (journal_t *);
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
+extern int	   jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int	   jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
+extern void	   jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
+extern void	   jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
 
 /*
  * journal_head management
@@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)
 
 /* journaling buffer types */
 #define BJ_None		0	/* Not journaled */
-#define BJ_SyncData	1	/* Normal data: flush before commit */
-#define BJ_Metadata	2	/* Normal journaled metadata */
-#define BJ_Forget	3	/* Buffer superseded by this transaction */
-#define BJ_IO		4	/* Buffer is for temporary IO use */
-#define BJ_Shadow	5	/* Buffer contents being shadowed to the log */
-#define BJ_LogCtl	6	/* Buffer contains log descriptors */
-#define BJ_Reserved	7	/* Buffer is reserved for access by journal */
-#define BJ_Locked	8	/* Locked for I/O during commit */
-#define BJ_Types	9
+#define BJ_Metadata	1	/* Normal journaled metadata */
+#define BJ_Forget	2	/* Buffer superseded by this transaction */
+#define BJ_IO		3	/* Buffer is for temporary IO use */
+#define BJ_Shadow	4	/* Buffer contents being shadowed to the log */
+#define BJ_LogCtl	5	/* Buffer contains log descriptors */
+#define BJ_Reserved	6	/* Buffer is reserved for access by journal */
+#define BJ_Types	7
 
 extern int jbd_blocks_per_page(struct inode *inode);
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index a744383d16e9..81b3dd5206e0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -398,7 +398,8 @@ int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_waterm
 int mlx4_INIT_PORT(struct mlx4_dev *dev, int port);
 int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
 
-int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback);
 int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
 
 int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 068a0c9946af..5c42821da2d1 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -11,11 +11,21 @@
  */
 #ifdef CONFIG_BLOCK
 
+struct mpage_data {
+	struct bio *bio;
+	sector_t last_block_in_bio;
+	get_block_t *get_block;
+	unsigned use_writepage;
+};
+
 struct writeback_control;
 
+struct bio *mpage_bio_submit(int rw, struct bio *bio);
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block);
 int mpage_readpage(struct page *page, get_block_t get_block);
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+		      void *data);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc112..208388835357 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc);
+	s64 ret = __percpu_counter_sum(fbc, 0);
 	return ret < 0 ? 0 : ret;
 }
 
+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+{
+	return __percpu_counter_sum(fbc, 1);
+}
+
+
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc);
+	return __percpu_counter_sum(fbc, 0);
 }
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index bd91987c065f..12b15c561a1f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
+	unsigned range_cont:1;
 };
 
 /*
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index c36750ff6ae8..483057b2f4b4 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -2,29 +2,33 @@
  * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
  *
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/cpl.php.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
  *
- * 2) under the terms of the "The BSD License" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/bsd-license.php.
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
  *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- *    copy of which is available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
  *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #if !defined(IB_ADDR_H)
@@ -57,6 +61,7 @@ struct rdma_dev_addr {
 	unsigned char dst_dev_addr[MAX_ADDR_LEN];
 	unsigned char broadcast[MAX_ADDR_LEN];
 	enum rdma_node_type dev_type;
+	struct net_device *src_dev;
 };
 
 /**
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index f179d233ffc3..00a2b8ec327f 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_cache.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef _IB_CACHE_H
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index a627c8682d2f..ec7c6d99ed3f 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_cm.h 4311 2005-12-05 18:42:01Z sean.hefty $
  */
 #if !defined(IB_CM_H)
 #define IB_CM_H
diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h
index 00dadbf94e1d..f62b842e6596 100644
--- a/include/rdma/ib_fmr_pool.h
+++ b/include/rdma/ib_fmr_pool.h
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_fmr_pool.h 2730 2005-06-28 16:43:03Z sean.hefty $
  */
 
 #if !defined(IB_FMR_POOL_H)
@@ -61,7 +59,7 @@ struct ib_fmr_pool_param {
 	int                     pool_size;
 	int                     dirty_watermark;
 	void                  (*flush_function)(struct ib_fmr_pool *pool,
-						void *              arg);
+						void               *arg);
 	void                   *flush_arg;
 	unsigned                cache:1;
 };
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 7228c056b9e9..5f6c40fffcf4 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -32,11 +32,9 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_mad.h 5596 2006-03-03 01:00:07Z sean.hefty $
  */
 
-#if !defined( IB_MAD_H )
+#if !defined(IB_MAD_H)
 #define IB_MAD_H
 
 #include <linux/list.h>
@@ -194,8 +192,7 @@ struct ib_vendor_mad {
 	u8			data[IB_MGMT_VENDOR_DATA];
 };
 
-struct ib_class_port_info
-{
+struct ib_class_port_info {
 	u8			base_version;
 	u8			class_version;
 	__be16			capability_mask;
@@ -614,11 +611,11 @@ int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
  * any class specific header, and MAD data area.
  * If @rmpp_active is set, the RMPP header will be initialized for sending.
  */
-struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
-					    u32 remote_qpn, u16 pkey_index,
-					    int rmpp_active,
-					    int hdr_len, int data_len,
-					    gfp_t gfp_mask);
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+					   u32 remote_qpn, u16 pkey_index,
+					   int rmpp_active,
+					   int hdr_len, int data_len,
+					   gfp_t gfp_mask);
 
 /**
  * ib_is_mad_class_rmpp - returns whether given management class
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index f926020d6331..d7fc45c4eba9 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_pack.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef IB_PACK_H
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 942692b0b92e..3841c1aff692 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_sa.h 2811 2005-07-06 18:11:43Z halr $
  */
 
 #ifndef IB_SA_H
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
index f29af135ba83..aaca0878668f 100644
--- a/include/rdma/ib_smi.h
+++ b/include/rdma/ib_smi.h
@@ -32,11 +32,9 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_smi.h 1389 2004-12-27 22:56:47Z roland $
  */
 
-#if !defined( IB_SMI_H )
+#if !defined(IB_SMI_H)
 #define IB_SMI_H
 
 #include <rdma/ib_mad.h>
diff --git a/include/rdma/ib_user_cm.h b/include/rdma/ib_user_cm.h
index 37650afb982c..bd3d380781e0 100644
--- a/include/rdma/ib_user_cm.h
+++ b/include/rdma/ib_user_cm.h
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_user_cm.h 4019 2005-11-11 00:33:09Z sean.hefty $
  */
 
 #ifndef IB_USER_CM_H
diff --git a/include/rdma/ib_user_mad.h b/include/rdma/ib_user_mad.h
index 29d2c7205a90..d6fce1cbdb90 100644
--- a/include/rdma/ib_user_mad.h
+++ b/include/rdma/ib_user_mad.h
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_user_mad.h 2814 2005-07-06 19:14:09Z halr $
  */
 
 #ifndef IB_USER_MAD_H
diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h
index 8d65bf0a625b..a17f77106149 100644
--- a/include/rdma/ib_user_verbs.h
+++ b/include/rdma/ib_user_verbs.h
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_user_verbs.h 4019 2005-11-11 00:33:09Z sean.hefty $
  */
 
 #ifndef IB_USER_VERBS_H
@@ -291,7 +289,10 @@ struct ib_uverbs_wc {
 	__u32 opcode;
 	__u32 vendor_err;
 	__u32 byte_len;
-	__u32 imm_data;
+	union {
+		__u32 imm_data;
+		__u32 invalidate_rkey;
+	} ex;
 	__u32 qp_num;
 	__u32 src_qp;
 	__u32 wc_flags;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 31d30b1852e8..90b529f7a154 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -34,8 +34,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ib_verbs.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #if !defined(IB_VERBS_H)
@@ -93,7 +91,7 @@ enum ib_device_cap_flags {
 	IB_DEVICE_RC_RNR_NAK_GEN	= (1<<12),
 	IB_DEVICE_SRQ_RESIZE		= (1<<13),
 	IB_DEVICE_N_NOTIFY_CQ		= (1<<14),
-	IB_DEVICE_ZERO_STAG		= (1<<15),
+	IB_DEVICE_LOCAL_DMA_LKEY	= (1<<15),
 	IB_DEVICE_RESERVED		= (1<<16), /* old SEND_W_INV */
 	IB_DEVICE_MEM_WINDOW		= (1<<17),
 	/*
@@ -105,6 +103,8 @@ enum ib_device_cap_flags {
 	 */
 	IB_DEVICE_UD_IP_CSUM		= (1<<18),
 	IB_DEVICE_UD_TSO		= (1<<19),
+	IB_DEVICE_MEM_MGT_EXTENSIONS	= (1<<21),
+	IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
 };
 
 enum ib_atomic_cap {
@@ -150,6 +150,7 @@ struct ib_device_attr {
 	int			max_srq;
 	int			max_srq_wr;
 	int			max_srq_sge;
+	unsigned int		max_fast_reg_page_list_len;
 	u16			max_pkeys;
 	u8			local_ca_ack_delay;
 };
@@ -226,6 +227,57 @@ static inline int ib_width_enum_to_int(enum ib_port_width width)
 	}
 }
 
+struct ib_protocol_stats {
+	/* TBD... */
+};
+
+struct iw_protocol_stats {
+	u64	ipInReceives;
+	u64	ipInHdrErrors;
+	u64	ipInTooBigErrors;
+	u64	ipInNoRoutes;
+	u64	ipInAddrErrors;
+	u64	ipInUnknownProtos;
+	u64	ipInTruncatedPkts;
+	u64	ipInDiscards;
+	u64	ipInDelivers;
+	u64	ipOutForwDatagrams;
+	u64	ipOutRequests;
+	u64	ipOutDiscards;
+	u64	ipOutNoRoutes;
+	u64	ipReasmTimeout;
+	u64	ipReasmReqds;
+	u64	ipReasmOKs;
+	u64	ipReasmFails;
+	u64	ipFragOKs;
+	u64	ipFragFails;
+	u64	ipFragCreates;
+	u64	ipInMcastPkts;
+	u64	ipOutMcastPkts;
+	u64	ipInBcastPkts;
+	u64	ipOutBcastPkts;
+
+	u64	tcpRtoAlgorithm;
+	u64	tcpRtoMin;
+	u64	tcpRtoMax;
+	u64	tcpMaxConn;
+	u64	tcpActiveOpens;
+	u64	tcpPassiveOpens;
+	u64	tcpAttemptFails;
+	u64	tcpEstabResets;
+	u64	tcpCurrEstab;
+	u64	tcpInSegs;
+	u64	tcpOutSegs;
+	u64	tcpRetransSegs;
+	u64	tcpInErrs;
+	u64	tcpOutRsts;
+};
+
+union rdma_protocol_stats {
+	struct ib_protocol_stats	ib;
+	struct iw_protocol_stats	iw;
+};
+
 struct ib_port_attr {
 	enum ib_port_state	state;
 	enum ib_mtu		max_mtu;
@@ -413,6 +465,8 @@ enum ib_wc_opcode {
 	IB_WC_FETCH_ADD,
 	IB_WC_BIND_MW,
 	IB_WC_LSO,
+	IB_WC_LOCAL_INV,
+	IB_WC_FAST_REG_MR,
 /*
  * Set value of IB_WC_RECV so consumers can test if a completion is a
  * receive by testing (opcode & IB_WC_RECV).
@@ -423,7 +477,8 @@ enum ib_wc_opcode {
 
 enum ib_wc_flags {
 	IB_WC_GRH		= 1,
-	IB_WC_WITH_IMM		= (1<<1)
+	IB_WC_WITH_IMM		= (1<<1),
+	IB_WC_WITH_INVALIDATE	= (1<<2),
 };
 
 struct ib_wc {
@@ -433,7 +488,10 @@ struct ib_wc {
 	u32			vendor_err;
 	u32			byte_len;
 	struct ib_qp	       *qp;
-	__be32			imm_data;
+	union {
+		__be32		imm_data;
+		u32		invalidate_rkey;
+	} ex;
 	u32			src_qp;
 	int			wc_flags;
 	u16			pkey_index;
@@ -498,7 +556,8 @@ enum ib_qp_type {
 };
 
 enum ib_qp_create_flags {
-	IB_QP_CREATE_IPOIB_UD_LSO	= 1 << 0,
+	IB_QP_CREATE_IPOIB_UD_LSO		= 1 << 0,
+	IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK	= 1 << 1,
 };
 
 struct ib_qp_init_attr {
@@ -627,6 +686,9 @@ enum ib_wr_opcode {
 	IB_WR_ATOMIC_FETCH_AND_ADD,
 	IB_WR_LSO,
 	IB_WR_SEND_WITH_INV,
+	IB_WR_RDMA_READ_WITH_INV,
+	IB_WR_LOCAL_INV,
+	IB_WR_FAST_REG_MR,
 };
 
 enum ib_send_flags {
@@ -643,6 +705,12 @@ struct ib_sge {
 	u32	lkey;
 };
 
+struct ib_fast_reg_page_list {
+	struct ib_device       *device;
+	u64		       *page_list;
+	unsigned int		max_page_list_len;
+};
+
 struct ib_send_wr {
 	struct ib_send_wr      *next;
 	u64			wr_id;
@@ -675,6 +743,15 @@ struct ib_send_wr {
 			u16	pkey_index; /* valid for GSI only */
 			u8	port_num;   /* valid for DR SMPs on switch only */
 		} ud;
+		struct {
+			u64				iova_start;
+			struct ib_fast_reg_page_list   *page_list;
+			unsigned int			page_shift;
+			unsigned int			page_list_len;
+			u32				length;
+			int				access_flags;
+			u32				rkey;
+		} fast_reg;
 	} wr;
 };
 
@@ -777,7 +854,7 @@ struct ib_cq {
 	struct ib_uobject      *uobject;
 	ib_comp_handler   	comp_handler;
 	void                  (*event_handler)(struct ib_event *, void *);
-	void *            	cq_context;
+	void                   *cq_context;
 	int               	cqe;
 	atomic_t          	usecnt; /* count number of work queues */
 };
@@ -883,7 +960,7 @@ struct ib_dma_mapping_ops {
 	void		(*sync_single_for_cpu)(struct ib_device *dev,
 					       u64 dma_handle,
 					       size_t size,
-				               enum dma_data_direction dir);
+					       enum dma_data_direction dir);
 	void		(*sync_single_for_device)(struct ib_device *dev,
 						  u64 dma_handle,
 						  size_t size,
@@ -919,6 +996,8 @@ struct ib_device {
 
 	struct iw_cm_verbs	     *iwcm;
 
+	int		           (*get_protocol_stats)(struct ib_device *device,
+							 union rdma_protocol_stats *stats);
 	int		           (*query_device)(struct ib_device *device,
 						   struct ib_device_attr *device_attr);
 	int		           (*query_port)(struct ib_device *device,
@@ -1013,6 +1092,11 @@ struct ib_device {
 	int                        (*query_mr)(struct ib_mr *mr,
 					       struct ib_mr_attr *mr_attr);
 	int                        (*dereg_mr)(struct ib_mr *mr);
+	struct ib_mr *		   (*alloc_fast_reg_mr)(struct ib_pd *pd,
+					       int max_page_list_len);
+	struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
+								   int page_list_len);
+	void			   (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
 	int                        (*rereg_phys_mr)(struct ib_mr *mr,
 						    int mr_rereg_mask,
 						    struct ib_pd *pd,
@@ -1065,6 +1149,7 @@ struct ib_device {
 
 	char			     node_desc[64];
 	__be64			     node_guid;
+	u32			     local_dma_lkey;
 	u8                           node_type;
 	u8                           phys_port_cnt;
 };
@@ -1807,6 +1892,54 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
 int ib_dereg_mr(struct ib_mr *mr);
 
 /**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ *   IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list length to be
+ *   used with fast register work requests for this MR.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array
+ * @device - ib device pointer.
+ * @page_list_len - size of the page list array to be allocated.
+ *
+ * This allocates and returns a struct ib_fast_reg_page_list * and a
+ * page_list array that is at least page_list_len in size.  The actual
+ * size is returned in max_page_list_len.  The caller is responsible
+ * for initializing the contents of the page_list array before posting
+ * a send work request with the IB_WC_FAST_REG_MR opcode.
+ *
+ * The page_list array entries must be translated using one of the
+ * ib_dma_*() functions just like the addresses passed to
+ * ib_map_phys_fmr().  Once the ib_post_send() is issued, the struct
+ * ib_fast_reg_page_list must not be modified by the caller until the
+ * IB_WC_FAST_REG_MR work request completes.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+				struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ *   page list array.
+ * @page_list - struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
+ * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
+ *   R_Key and L_Key.
+ * @mr - struct ib_mr pointer to be updated.
+ * @newkey - new key to be used.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+	mr->lkey = (mr->lkey & 0xffffff00) | newkey;
+	mr->rkey = (mr->rkey & 0xffffff00) | newkey;
+}
+
+/**
  * ib_alloc_mw - Allocates a memory window.
  * @pd: The protection domain associated with the memory window.
  */
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
index aeefa9b740dc..cbb822e8d791 100644
--- a/include/rdma/iw_cm.h
+++ b/include/rdma/iw_cm.h
@@ -62,7 +62,7 @@ struct iw_cm_event {
 	struct sockaddr_in remote_addr;
 	void *private_data;
 	u8 private_data_len;
-	void* provider_data;
+	void *provider_data;
 };
 
 /**
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 010f876f41d8..22bb2e7bab1a 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -2,29 +2,33 @@
  * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
  *
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/cpl.php.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
  *
- * 2) under the terms of the "The BSD License" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/bsd-license.php.
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
  *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- *    copy of which is available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
  *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #if !defined(RDMA_CM_H)
@@ -57,11 +61,11 @@ enum rdma_cm_event_type {
 };
 
 enum rdma_port_space {
-	RDMA_PS_SDP  = 0x0001,
-	RDMA_PS_IPOIB= 0x0002,
-	RDMA_PS_TCP  = 0x0106,
-	RDMA_PS_UDP  = 0x0111,
-	RDMA_PS_SCTP = 0x0183
+	RDMA_PS_SDP   = 0x0001,
+	RDMA_PS_IPOIB = 0x0002,
+	RDMA_PS_TCP   = 0x0106,
+	RDMA_PS_UDP   = 0x0111,
+	RDMA_PS_SCTP  = 0x0183
 };
 
 struct rdma_addr {
diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h
index 950424b38f16..2389c3b45404 100644
--- a/include/rdma/rdma_cm_ib.h
+++ b/include/rdma/rdma_cm_ib.h
@@ -1,29 +1,33 @@
 /*
  * Copyright (c) 2006 Intel Corporation.  All rights reserved.
  *
- * This Software is licensed under one of the following licenses:
- *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/cpl.php.
- *
- * 2) under the terms of the "The BSD License" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/bsd-license.php.
- *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- *    copy of which is available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
- *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #if !defined(RDMA_CM_IB_H)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 119174494cb5..4a8ba4bf5f6f 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 {
 	s64 ret;
 	int cpu;
@@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
+		if (set)
+			*pcount = 0;
 	}
+	if (set)
+		fbc->count = ret;
+
 	spin_unlock(&fbc->lock);
 	return ret;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6a7d34874f..65d9d9e2b755 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 				loff_t end)
 {
 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
+EXPORT_SYMBOL(filemap_fdatawrite_range);
 
 /**
  * filemap_flush - mostly a non-blocking flush
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b38f700825fc..94c6d8988ab3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -960,6 +960,9 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
+
+	if (wbc->range_cont)
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(write_cache_pages);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 91200feb3f9c..63f131fc42e4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -555,15 +555,13 @@ static int selinux_set_mnt_opts(struct super_block *sb,
 	struct task_security_struct *tsec = current->security;
 	struct superblock_security_struct *sbsec = sb->s_security;
 	const char *name = sb->s_type->name;
-	struct dentry *root = sb->s_root;
-	struct inode *root_inode = root->d_inode;
-	struct inode_security_struct *root_isec = root_inode->i_security;
+	struct inode *inode = sbsec->sb->s_root->d_inode;
+	struct inode_security_struct *root_isec = inode->i_security;
 	u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0;
 	u32 defcontext_sid = 0;
 	char **mount_options = opts->mnt_opts;
 	int *flags = opts->mnt_opts_flags;
 	int num_opts = opts->num_mnt_opts;
-	bool can_xattr = false;
 
 	mutex_lock(&sbsec->lock);
 
@@ -667,24 +665,14 @@ static int selinux_set_mnt_opts(struct super_block *sb,
 		goto out;
 	}
 
-	if (strcmp(name, "proc") == 0)
+	if (strcmp(sb->s_type->name, "proc") == 0)
 		sbsec->proc = 1;
 
-	/*
-	 * test if the fs supports xattrs, fs_use might make use of this if the
-	 * fs has no definition in policy.
-	 */
-	if (root_inode->i_op->getxattr) {
-		rc = root_inode->i_op->getxattr(root, XATTR_NAME_SELINUX, NULL, 0);
-		if (rc >= 0 || rc == -ENODATA)
-			can_xattr = true;
-	}
-
 	/* Determine the labeling behavior to use for this filesystem type. */
-	rc = security_fs_use(name, &sbsec->behavior, &sbsec->sid, can_xattr);
+	rc = security_fs_use(sb->s_type->name, &sbsec->behavior, &sbsec->sid);
 	if (rc) {
 		printk(KERN_WARNING "%s: security_fs_use(%s) returned %d\n",
-		       __func__, name, rc);
+		       __func__, sb->s_type->name, rc);
 		goto out;
 	}
 
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 44cba2e21dcf..7c543003d653 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -136,7 +136,7 @@ int security_get_allow_unknown(void);
 #define SECURITY_FS_USE_MNTPOINT	6 /* use mountpoint labeling */
 
 int security_fs_use(const char *fstype, unsigned int *behavior,
-	u32 *sid, bool can_xattr);
+	u32 *sid);
 
 int security_genfs_sid(const char *fstype, char *name, u16 sclass,
 	u32 *sid);
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 8e42da120101..b52f923ce680 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -1934,8 +1934,7 @@ out:
 int security_fs_use(
 	const char *fstype,
 	unsigned int *behavior,
-	u32 *sid,
-	bool can_xattr)
+	u32 *sid)
 {
 	int rc = 0;
 	struct ocontext *c;
@@ -1949,7 +1948,6 @@ int security_fs_use(
 		c = c->next;
 	}
 
-	/* look for labeling behavior defined in policy */
 	if (c) {
 		*behavior = c->v.behavior;
 		if (!c->sid[0]) {
@@ -1960,23 +1958,14 @@ int security_fs_use(
 				goto out;
 		}
 		*sid = c->sid[0];
-		goto out;
-	}
-
-	/* labeling behavior not in policy, use xattrs if possible */
-	if (can_xattr) {
-		*behavior = SECURITY_FS_USE_XATTR;
-		*sid = SECINITSID_FS;
-		goto out;
-	}
-
-	/* no behavior in policy and can't use xattrs, try GENFS */
-	rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, sid);
-	if (rc) {
-		*behavior = SECURITY_FS_USE_NONE;
-		rc = 0;
 	} else {
-		*behavior = SECURITY_FS_USE_GENFS;
+		rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, sid);
+		if (rc) {
+			*behavior = SECURITY_FS_USE_NONE;
+			rc = 0;
+		} else {
+			*behavior = SECURITY_FS_USE_GENFS;
+		}
 	}
 
 out: