47 files changed, 1834 insertions, 208 deletions
diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9fdccf..47c32c3bfa1d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,9 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
 #include <linux/timekeeping.h>
 
 #include <asm/uaccess.h>
@@ -649,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo)
 		}
 	} else {
 		struct inode *inode;
+		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
+				 O_LARGEFILE | O_EXCL;
 
 		if (cprm.limit < binfmt->min_coredump)
 			goto fail_unlock;
@@ -687,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo)
 		 * what matters is that at least one of the two processes
 		 * writes its coredump successfully, not which one.
 		 */
-		cprm.file = filp_open(cn.corename,
-				 O_CREAT | 2 | O_NOFOLLOW |
-				 O_LARGEFILE | O_EXCL,
-				 0600);
+		if (need_suid_safe) {
+			/*
+			 * Using user namespaces, normal user tasks can change
+			 * their current->fs->root to point to arbitrary
+			 * directories. Since the intention of the "only dump
+			 * with a fully qualified path" rule is to control where
+			 * coredumps may be placed using root privileges,
+			 * current->fs->root must not be used. Instead, use the
+			 * root directory of init_task.
+			 */
+			struct path root;
+
+			task_lock(&init_task);
+			get_fs_root(init_task.fs, &root);
+			task_unlock(&init_task);
+			cprm.file = file_open_root(root.dentry, root.mnt,
+				cn.corename, open_flags, 0600);
+			path_put(&root);
+		} else {
+			cprm.file = filp_open(cn.corename, open_flags, 0600);
+		}
 		if (IS_ERR(cprm.file))
 			goto fail_unlock;
 
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index c5c84dfb5b3e..9893d1538122 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -635,8 +635,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
 		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
-		rc = -ENOMEM;
-		goto out;
+		return -ENOMEM;
 	}
 	(*packet_size) = 0;
 	rc = ecryptfs_find_auth_tok_for_sig(
@@ -922,8 +921,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
 		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
-		rc = -ENOMEM;
-		goto out;
+		return -ENOMEM;
 	}
 	if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
 		printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ed70cf9fdc7b..1231cd1999d8 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
 	u64 count;
 
 	poll_wait(file, &ctx->wqh, wait);
-	smp_rmb();
-	count = ctx->count;
+
+	/*
+	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
+	 * can be done outside ctx->wqh.lock because we know that poll_wait
+	 * takes that lock (through add_wait_queue) if our caller will sleep.
+	 *
+	 * The read _can_ therefore seep into add_wait_queue's critical
+	 * section, but cannot move above it!  add_wait_queue's spin_lock acts
+	 * as an acquire barrier and ensures that the read be ordered properly
+	 * against the writes.  The following CAN happen and is safe:
+	 *
+	 *     poll                               write
+	 *     -----------------                  ------------
+	 *     lock ctx->wqh.lock (in poll_wait)
+	 *     count = ctx->count
+	 *     __add_wait_queue
+	 *     unlock ctx->wqh.lock
+	 *                                        lock ctx->qwh.lock
+	 *                                        ctx->count += n
+	 *                                        if (waitqueue_active)
+	 *                                          wake_up_locked_poll
+	 *                                        unlock ctx->qwh.lock
+	 *     eventfd_poll returns 0
+	 *
+	 * but the following, which would miss a wakeup, cannot happen:
+	 *
+	 *     poll                               write
+	 *     -----------------                  ------------
+	 *     count = ctx->count (INVALID!)
+	 *                                        lock ctx->qwh.lock
+	 *                                        ctx->count += n
+	 *                                        **waitqueue_active is false**
+	 *                                        **no wake_up_locked_poll!**
+	 *                                        unlock ctx->qwh.lock
+	 *     lock ctx->wqh.lock (in poll_wait)
+	 *     __add_wait_queue
+	 *     unlock ctx->wqh.lock
+	 *     eventfd_poll returns 0
+	 */
+	count = READ_ONCE(ctx->count);
 
 	if (count > 0)
 		events |= POLLIN;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 33f5e2a50cf8..50ba27cbed03 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -285,7 +285,7 @@ errout:
 static inline int is_32bit_api(void)
 {
 #ifdef CONFIG_COMPAT
-	return is_compat_task();
+	return in_compat_syscall();
 #else
 	return (BITS_PER_LONG == 32);
 #endif
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index 182f9ffe2b51..3ff1772f612e 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -93,8 +93,24 @@ config FAT_DEFAULT_IOCHARSET
 	  that most of your FAT filesystems use, and can be overridden
 	  with the "iocharset" mount option for FAT filesystems.
 	  Note that "utf8" is not recommended for FAT filesystems.
-	  If unsure, you shouldn't set "utf8" here.
+	  If unsure, you shouldn't set "utf8" here - select the next option
+	  instead if you would like to use UTF-8 encoded file names by default.
 	  See <file:Documentation/filesystems/vfat.txt> for more information.
 
 	  Enable any character sets you need in File Systems/Native Language
 	  Support.
+
+config FAT_DEFAULT_UTF8
+	bool "Enable FAT UTF-8 option by default"
+	depends on VFAT_FS
+	default n
+	help
+	  Set this if you would like to have "utf8" mount option set
+	  by default when mounting FAT filesystems.
+
+	  Even if you say Y here can always disable UTF-8 for
+	  particular mount by adding "utf8=0" to mount options.
+
+	  Say Y if you use UTF-8 encoding for file names, N otherwise.
+
+	  See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a5599052116c..226281068a46 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1127,7 +1127,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 	}
 	opts->name_check = 'n';
 	opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
-	opts->utf8 = opts->unicode_xlate = 0;
+	opts->unicode_xlate = 0;
 	opts->numtail = 1;
 	opts->usefree = opts->nocase = 0;
 	opts->tz_set = 0;
@@ -1135,6 +1135,8 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 	opts->errors = FAT_ERRORS_RO;
 	*debug = 0;
 
+	opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
+
 	if (!options)
 		goto out;
 
diff --git a/fs/fhandle.c b/fs/fhandle.c
index d59712dfa3e7..ca3c3dd01789 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd,
 		path_put(&path);
 		return fd;
 	}
-	file = file_open_root(path.dentry, path.mnt, "", open_flag);
+	file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
 	if (IS_ERR(file)) {
 		put_unused_fd(fd);
 		retval =  PTR_ERR(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5c46ed9f3e14..fee81e8768c9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
 		wb_get(wb);
 		spin_unlock(&inode->i_lock);
 		spin_lock(&wb->list_lock);
-		wb_put(wb);		/* not gonna deref it anymore */
 
 		/* i_wb may have changed inbetween, can't use inode_to_wb() */
-		if (likely(wb == inode->i_wb))
-			return wb;	/* @inode already has ref */
+		if (likely(wb == inode->i_wb)) {
+			wb_put(wb);	/* @inode already has ref */
+			return wb;
+		}
 
 		spin_unlock(&wb->list_lock);
+		wb_put(wb);
 		cpu_relax();
 		spin_lock(&inode->i_lock);
 	}
@@ -1337,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
  * and does more profound writeback list handling in writeback_sb_inodes().
  */
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
-		       struct writeback_control *wbc)
+static int writeback_single_inode(struct inode *inode,
+				  struct writeback_control *wbc)
 {
+	struct bdi_writeback *wb;
 	int ret = 0;
 
 	spin_lock(&inode->i_lock);
@@ -1378,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	ret = __writeback_single_inode(inode, wbc);
 
 	wbc_detach_inode(wbc);
-	spin_lock(&wb->list_lock);
+
+	wb = inode_to_wb_and_lock_list(inode);
 	spin_lock(&inode->i_lock);
 	/*
 	 * If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1453,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
+		struct bdi_writeback *tmp_wb;
 
 		if (inode->i_sb != sb) {
 			if (work->sb) {
@@ -1543,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb,
 			cond_resched();
 		}
 
-
-		spin_lock(&wb->list_lock);
+		/*
+		 * Requeue @inode if still dirty.  Be careful as @inode may
+		 * have been switched to another wb in the meantime.
+		 */
+		tmp_wb = inode_to_wb_and_lock_list(inode);
 		spin_lock(&inode->i_lock);
 		if (!(inode->i_state & I_DIRTY_ALL))
 			wrote++;
-		requeue_inode(inode, wb, &wbc);
+		requeue_inode(inode, tmp_wb, &wbc);
 		inode_sync_complete(inode);
 		spin_unlock(&inode->i_lock);
 
+		if (unlikely(tmp_wb != wb)) {
+			spin_unlock(&tmp_wb->list_lock);
+			spin_lock(&wb->list_lock);
+		}
+
 		/*
 		 * bail out to wb_writeback() often enough to check
 		 * background threshold and other termination conditions.
@@ -2338,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
  */
 int write_inode_now(struct inode *inode, int sync)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2350,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	return writeback_single_inode(inode, wb, &wbc);
+	return writeback_single_inode(inode, &wbc);
 }
 EXPORT_SYMBOL(write_inode_now);
 
@@ -2367,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
+	return writeback_single_inode(inode, wbc);
 }
 EXPORT_SYMBOL(sync_inode);
 
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 95d5880a63ee..7e553f286775 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -134,37 +134,59 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	if (mutex_lock_interruptible(&c->alloc_sem))
 		return -EINTR;
 
+
 	for (;;) {
+		/* We can't start doing GC until we've finished checking
+		   the node CRCs etc. */
+		int bucket, want_ino;
+
 		spin_lock(&c->erase_completion_lock);
 		if (!c->unchecked_size)
 			break;
-
-		/* We can't start doing GC yet. We haven't finished checking
-		   the node CRCs etc. Do it now. */
-
-		/* checked_ino is protected by the alloc_sem */
-		if (c->checked_ino > c->highest_ino && xattr) {
-			pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
-				c->unchecked_size);
-			jffs2_dbg_dump_block_lists_nolock(c);
-			spin_unlock(&c->erase_completion_lock);
-			mutex_unlock(&c->alloc_sem);
-			return -ENOSPC;
-		}
-
 		spin_unlock(&c->erase_completion_lock);
 
 		if (!xattr)
 			xattr = jffs2_verify_xattr(c);
 
 		spin_lock(&c->inocache_lock);
+		/* Instead of doing the inodes in numeric order, doing a lookup
+		 * in the hash for each possible number, just walk the hash
+		 * buckets of *existing* inodes. This means that we process
+		 * them out-of-order, but it can be a lot faster if there's
+		 * a sparse inode# space. Which there often is. */
+		want_ino = c->check_ino;
+		for (bucket = c->check_ino % c->inocache_hashsize ; bucket < c->inocache_hashsize; bucket++) {
+			for (ic = c->inocache_list[bucket]; ic; ic = ic->next) {
+				if (ic->ino < want_ino)
+					continue;
+
+				if (ic->state != INO_STATE_CHECKEDABSENT &&
+				    ic->state != INO_STATE_PRESENT)
+					goto got_next; /* with inocache_lock held */
+
+				jffs2_dbg(1, "Skipping ino #%u already checked\n",
+					  ic->ino);
+			}
+			want_ino = 0;
+		}
 
-		ic = jffs2_get_ino_cache(c, c->checked_ino++);
+		/* Point c->check_ino past the end of the last bucket. */
+		c->check_ino = ((c->highest_ino + c->inocache_hashsize + 1) &
+				~c->inocache_hashsize) - 1;
 
-		if (!ic) {
-			spin_unlock(&c->inocache_lock);
-			continue;
-		}
+		spin_unlock(&c->inocache_lock);
+
+		pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
+			c->unchecked_size);
+		jffs2_dbg_dump_block_lists_nolock(c);
+		mutex_unlock(&c->alloc_sem);
+		return -ENOSPC;
+
+	got_next:
+		/* For next time round the loop, we want c->checked_ino to indicate
+		 * the *next* one we want to check. And since we're walking the
+		 * buckets rather than doing it sequentially, it's: */
+		c->check_ino = ic->ino + c->inocache_hashsize;
 
 		if (!ic->pino_nlink) {
 			jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n",
@@ -176,8 +198,6 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 		switch(ic->state) {
 		case INO_STATE_CHECKEDABSENT:
 		case INO_STATE_PRESENT:
-			jffs2_dbg(1, "Skipping ino #%u already checked\n",
-				  ic->ino);
 			spin_unlock(&c->inocache_lock);
 			continue;
 
@@ -196,7 +216,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 				  ic->ino);
 			/* We need to come back again for the _same_ inode. We've
 			 made no progress in this case, but that should be OK */
-			c->checked_ino--;
+			c->check_ino = ic->ino;
 
 			mutex_unlock(&c->alloc_sem);
 			sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 046fee8b6e9b..778275f48a87 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -49,7 +49,7 @@ struct jffs2_sb_info {
 	struct mtd_info *mtd;
 
 	uint32_t highest_ino;
-	uint32_t checked_ino;
+	uint32_t check_ino;		/* *NEXT* inode to be checked */
 
 	unsigned int flags;
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index b6bd4affd9ad..cda0774c2c9c 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -846,8 +846,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
 		return 1;
 
 	if (c->unchecked_size) {
-		jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
-			  c->unchecked_size, c->checked_ino);
+		jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, check_ino #%d\n",
+			  c->unchecked_size, c->check_ino);
 		return 1;
 	}
 
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5a3da3f52908..b25d28a21212 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1183,22 +1183,20 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
 
 int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 {
-	struct nand_ecclayout *oinfo = c->mtd->ecclayout;
-
 	if (!c->mtd->oobsize)
 		return 0;
 
 	/* Cleanmarker is out-of-band, so inline size zero */
 	c->cleanmarker_size = 0;
 
-	if (!oinfo || oinfo->oobavail == 0) {
+	if (c->mtd->oobavail == 0) {
 		pr_err("inconsistent device description\n");
 		return -EINVAL;
 	}
 
 	jffs2_dbg(1, "using OOB on NAND\n");
 
-	c->oobavail = oinfo->oobavail;
+	c->oobavail = c->mtd->oobavail;
 
 	/* Initialise write buffer */
 	init_rwsem(&c->wbuf_sem);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 8bc870e4c467..02e4d87d2ed3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
 	kfree(bl);
 }
 
-static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
-						   gfp_t gfp_flags)
+static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
+		gfp_t gfp_flags, bool is_scsi_layout)
 {
 	struct pnfs_block_layout *bl;
 
@@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
 	bl->bl_ext_ro = RB_ROOT;
 	spin_lock_init(&bl->bl_ext_lock);
 
+	bl->bl_scsi_layout = is_scsi_layout;
 	return &bl->bl_layout;
 }
 
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+						   gfp_t gfp_flags)
+{
+	return __bl_alloc_layout_hdr(inode, gfp_flags, false);
+}
+
+static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
+						   gfp_t gfp_flags)
+{
+	return __bl_alloc_layout_hdr(inode, gfp_flags, true);
+}
+
 static void bl_free_lseg(struct pnfs_layout_segment *lseg)
 {
 	dprintk("%s enter\n", __func__);
@@ -889,22 +902,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.sync				= pnfs_generic_sync,
 };
 
+static struct pnfs_layoutdriver_type scsilayout_type = {
+	.id				= LAYOUT_SCSI,
+	.name				= "LAYOUT_SCSI",
+	.owner				= THIS_MODULE,
+	.flags				= PNFS_LAYOUTRET_ON_SETATTR |
+					  PNFS_READ_WHOLE_PAGE,
+	.read_pagelist			= bl_read_pagelist,
+	.write_pagelist			= bl_write_pagelist,
+	.alloc_layout_hdr		= sl_alloc_layout_hdr,
+	.free_layout_hdr		= bl_free_layout_hdr,
+	.alloc_lseg			= bl_alloc_lseg,
+	.free_lseg			= bl_free_lseg,
+	.return_range			= bl_return_range,
+	.prepare_layoutcommit		= bl_prepare_layoutcommit,
+	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
+	.set_layoutdriver		= bl_set_layoutdriver,
+	.alloc_deviceid_node		= bl_alloc_deviceid_node,
+	.free_deviceid_node		= bl_free_deviceid_node,
+	.pg_read_ops			= &bl_pg_read_ops,
+	.pg_write_ops			= &bl_pg_write_ops,
+	.sync				= pnfs_generic_sync,
+};
+
+
 static int __init nfs4blocklayout_init(void)
 {
 	int ret;
 
 	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
 
-	ret = pnfs_register_layoutdriver(&blocklayout_type);
+	ret = bl_init_pipefs();
 	if (ret)
 		goto out;
-	ret = bl_init_pipefs();
+
+	ret = pnfs_register_layoutdriver(&blocklayout_type);
 	if (ret)
-		goto out_unregister;
+		goto out_cleanup_pipe;
+
+	ret = pnfs_register_layoutdriver(&scsilayout_type);
+	if (ret)
+		goto out_unregister_block;
 	return 0;
 
-out_unregister:
+out_unregister_block:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
+out_cleanup_pipe:
+	bl_cleanup_pipefs();
 out:
 	return ret;
 }
@@ -914,8 +958,9 @@ static void __exit nfs4blocklayout_exit(void)
 	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
 	       __func__);
 
-	bl_cleanup_pipefs();
+	pnfs_unregister_layoutdriver(&scsilayout_type);
 	pnfs_unregister_layoutdriver(&blocklayout_type);
+	bl_cleanup_pipefs();
 }
 
 MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c556640dcf3b..bc21205309e0 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -55,7 +55,6 @@ struct pnfs_block_dev;
  */
 #define PNFS_BLOCK_UUID_LEN	128
 
-
 struct pnfs_block_volume {
 	enum pnfs_block_volume_type	type;
 	union {
@@ -82,6 +81,13 @@ struct pnfs_block_volume {
 			u32		volumes_count;
 			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
 		} stripe;
+		struct {
+			enum scsi_code_set		code_set;
+			enum scsi_designator_type	designator_type;
+			int				designator_len;
+			u8				designator[256];
+			u64				pr_key;
+		} scsi;
 	};
 };
 
@@ -106,6 +112,9 @@ struct pnfs_block_dev {
 	struct block_device		*bdev;
 	u64				disk_offset;
 
+	u64				pr_key;
+	bool				pr_registered;
+
 	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
 			struct pnfs_block_dev_map *map);
 };
@@ -131,6 +140,7 @@ struct pnfs_block_layout {
 	struct rb_root		bl_ext_rw;
 	struct rb_root		bl_ext_ro;
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
+	bool			bl_scsi_layout;
 };
 
 static inline struct pnfs_block_layout *
@@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
 dev_t bl_resolve_deviceid(struct nfs_server *server,
 		struct pnfs_block_volume *b, gfp_t gfp_mask);
 int __init bl_init_pipefs(void);
-void __exit bl_cleanup_pipefs(void);
+void bl_cleanup_pipefs(void);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index a861bbdfe577..e5b89675263e 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/blkdev.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_xdr.h>
+#include <linux/pr.h>
 
 #include "blocklayout.h"
 
@@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
 			bl_free_device(&dev->children[i]);
 		kfree(dev->children);
 	} else {
+		if (dev->pr_registered) {
+			const struct pr_ops *ops =
+				dev->bdev->bd_disk->fops->pr_ops;
+			int error;
+
+			error = ops->pr_register(dev->bdev, dev->pr_key, 0,
+				false);
+			if (error)
+				pr_err("failed to unregister PR key.\n");
+		}
+
 		if (dev->bdev)
 			blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
 	}
@@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 		for (i = 0; i < b->stripe.volumes_count; i++)
 			b->stripe.volumes[i] = be32_to_cpup(p++);
 		break;
+	case PNFS_BLOCK_VOLUME_SCSI:
+		p = xdr_inline_decode(xdr, 4 + 4 + 4);
+		if (!p)
+			return -EIO;
+		b->scsi.code_set = be32_to_cpup(p++);
+		b->scsi.designator_type = be32_to_cpup(p++);
+		b->scsi.designator_len = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, b->scsi.designator_len);
+		if (!p)
+			return -EIO;
+		if (b->scsi.designator_len > 256)
+			return -EIO;
+		memcpy(&b->scsi.designator, p, b->scsi.designator_len);
+		p = xdr_inline_decode(xdr, 8);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->scsi.pr_key);
+		break;
 	default:
 		dprintk("unknown volume type!\n");
 		return -EIO;
@@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
 	return 0;
 }
 
+static bool
+bl_validate_designator(struct pnfs_block_volume *v)
+{
+	switch (v->scsi.designator_type) {
+	case PS_DESIGNATOR_EUI64:
+		if (v->scsi.code_set != PS_CODE_SET_BINARY)
+			return false;
+
+		if (v->scsi.designator_len != 8 &&
+		    v->scsi.designator_len != 10 &&
+		    v->scsi.designator_len != 16)
+			return false;
+
+		return true;
+	case PS_DESIGNATOR_NAA:
+		if (v->scsi.code_set != PS_CODE_SET_BINARY)
+			return false;
+
+		if (v->scsi.designator_len != 8 &&
+		    v->scsi.designator_len != 16)
+			return false;
+
+		return true;
+	case PS_DESIGNATOR_T10:
+	case PS_DESIGNATOR_NAME:
+		pr_err("pNFS: unsupported designator "
+			"(code set %d, type %d, len %d.\n",
+			v->scsi.code_set,
+			v->scsi.designator_type,
+			v->scsi.designator_len);
+		return false;
+	default:
+		pr_err("pNFS: invalid designator "
+			"(code set %d, type %d, len %d.\n",
+			v->scsi.code_set,
+			v->scsi.designator_type,
+			v->scsi.designator_len);
+		return false;
+	}
+}
+
+static int
+bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	const struct pr_ops *ops;
+	const char *devname;
+	int error;
+
+	if (!bl_validate_designator(v))
+		return -EINVAL;
+
+	switch (v->scsi.designator_len) {
+	case 8:
+		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
+				v->scsi.designator);
+		break;
+	case 12:
+		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
+				v->scsi.designator);
+		break;
+	case 16:
+		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
+				v->scsi.designator);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
+	if (IS_ERR(d->bdev)) {
+		pr_warn("pNFS: failed to open device %s (%ld)\n",
+			devname, PTR_ERR(d->bdev));
+		kfree(devname);
+		return PTR_ERR(d->bdev);
+	}
+
+	kfree(devname);
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+	d->pr_key = v->scsi.pr_key;
+
+	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
+		d->bdev->bd_disk->disk_name, d->pr_key);
+
+	ops = d->bdev->bd_disk->fops->pr_ops;
+	if (!ops) {
+		pr_err("pNFS: block device %s does not support reservations.",
+				d->bdev->bd_disk->disk_name);
+		error = -EINVAL;
+		goto out_blkdev_put;
+	}
+
+	error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+	if (error) {
+		pr_err("pNFS: failed to register key for block device %s.",
+				d->bdev->bd_disk->disk_name);
+		goto out_blkdev_put;
+	}
+
+	d->pr_registered = true;
+	return 0;
+
+out_blkdev_put:
+	blkdev_put(d->bdev, FMODE_READ);
+	return error;
+}
+
 static int
 bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
 		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
 	case PNFS_BLOCK_VOLUME_STRIPE:
 		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_SCSI:
+		return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
 	default:
 		dprintk("unsupported volume type: %d\n", volumes[idx].type);
 		return -EIO;
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 35ab51c04814..720b3ff55fa9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 
 #include <linux/vmalloc.h>
@@ -462,10 +462,12 @@ out:
 	return err;
 }
 
-static size_t ext_tree_layoutupdate_size(size_t count)
+static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
 {
-	return sizeof(__be32) /* number of entries */ +
-		PNFS_BLOCK_EXTENT_SIZE * count;
+	if (bl->bl_scsi_layout)
+		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
+	else
+		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
 }
 
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
@@ -483,6 +485,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 	}
 }
 
+static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
+{
+	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+			NFS4_DEVICEID4_SIZE);
+	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+	p = xdr_encode_hyper(p, 0LL);
+	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+	return p;
+}
+
+static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
+{
+	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+}
+
 static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 		size_t buffer_size, size_t *count)
 {
@@ -496,19 +515,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 			continue;
 
 		(*count)++;
-		if (ext_tree_layoutupdate_size(*count) > buffer_size) {
+		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
 			/* keep counting.. */
 			ret = -ENOSPC;
 			continue;
 		}
 
-		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
-				NFS4_DEVICEID4_SIZE);
-		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, 0LL);
-		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-
+		if (bl->bl_scsi_layout)
+			p = encode_scsi_range(be, p);
+		else
+			p = encode_block_extent(be, p);
 		be->be_tag = EXTENT_COMMITTING;
 	}
 	spin_unlock(&bl->bl_ext_lock);
@@ -537,7 +553,7 @@ retry:
 	if (unlikely(ret)) {
 		ext_tree_free_commitdata(arg, buffer_size);
 
-		buffer_size = ext_tree_layoutupdate_size(count);
+		buffer_size = ext_tree_layoutupdate_size(bl, count);
 		count = 0;
 
 		arg->layoutupdate_pages =
@@ -556,7 +572,7 @@ retry:
 	}
 
 	*start_p = cpu_to_be32(count);
-	arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
+	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
 
 	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 		void *p = start_p, *end = p + arg->layoutupdate_len;
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index dbe5839cdeba..9fb067a6f7e0 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -281,7 +281,7 @@ out:
 	return ret;
 }
 
-void __exit bl_cleanup_pipefs(void)
+void bl_cleanup_pipefs(void)
 {
 	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
 	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index a0b77fc1bd39..c9f583d7bac8 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -84,12 +84,30 @@ config NFSD_V4
 	  If unsure, say N.
 
 config NFSD_PNFS
-	bool "NFSv4.1 server support for Parallel NFS (pNFS)"
-	depends on NFSD_V4
+	bool
+
+config NFSD_BLOCKLAYOUT
+	bool "NFSv4.1 server support for pNFS block layouts"
+	depends on NFSD_V4 && BLOCK
+	select NFSD_PNFS
+	help
+	  This option enables support for the exporting pNFS block layouts
+	  in the kernel's NFS server. The pNFS block layout enables NFS
+	  clients to directly perform I/O to block devices accesible to both
+	  the server and the clients.  See RFC 5663 for more details.
+
+	  If unsure, say N.
+
+config NFSD_SCSILAYOUT
+	bool "NFSv4.1 server support for pNFS SCSI layouts"
+	depends on NFSD_V4 && BLOCK
+	select NFSD_PNFS
 	help
-	  This option enables support for the parallel NFS features of the
-	  minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
-	  server.
+	  This option enables support for the exporting pNFS SCSI layouts
+	  in the kernel's NFS server. The pNFS SCSI layout enables NFS
+	  clients to directly perform I/O to SCSI devices accesible to both
+	  the server and the clients.  See draft-ietf-nfsv4-scsi-layout for
+	  more details.
 
 	  If unsure, say N.
 
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9a6028e120c6..3ae5f3c77e28 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -17,4 +17,6 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
-nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
+nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c29d9421bd5e..e55b5242614d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -1,11 +1,14 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/exportfs.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
+#include <linux/pr.h>
 
 #include <linux/nfsd/debug.h>
+#include <scsi/scsi_proto.h>
+#include <scsi/scsi_common.h>
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
@@ -13,37 +16,6 @@
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
 
-static int
-nfsd4_block_get_device_info_simple(struct super_block *sb,
-		struct nfsd4_getdeviceinfo *gdp)
-{
-	struct pnfs_block_deviceaddr *dev;
-	struct pnfs_block_volume *b;
-
-	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
-		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
-	if (!dev)
-		return -ENOMEM;
-	gdp->gd_device = dev;
-
-	dev->nr_volumes = 1;
-	b = &dev->volumes[0];
-
-	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
-	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
-	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
-			&b->simple.offset);
-}
-
-static __be32
-nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
-		struct nfsd4_getdeviceinfo *gdp)
-{
-	if (sb->s_bdev != sb->s_bdev->bd_contains)
-		return nfserr_inval;
-	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
-}
-
 static __be32
 nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
 		struct nfsd4_layoutget *args)
@@ -141,20 +113,13 @@ out_layoutunavailable:
 }
 
 static __be32
-nfsd4_block_proc_layoutcommit(struct inode *inode,
-		struct nfsd4_layoutcommit *lcp)
+nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
+		struct iomap *iomaps, int nr_iomaps)
 {
 	loff_t new_size = lcp->lc_last_wr + 1;
 	struct iattr iattr = { .ia_valid = 0 };
-	struct iomap *iomaps;
-	int nr_iomaps;
 	int error;
 
-	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
-			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
-	if (nr_iomaps < 0)
-		return nfserrno(nr_iomaps);
-
 	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
 	    timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
 		lcp->lc_mtime = current_fs_time(inode->i_sb);
@@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
 	return nfserrno(error);
 }
 
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev;
+	struct pnfs_block_volume *b;
+
+	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	gdp->gd_device = dev;
+
+	dev->nr_volumes = 1;
+	b = &dev->volumes[0];
+
+	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
+	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
+			&b->simple.offset);
+}
+
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	if (sb->s_bdev != sb->s_bdev->bd_contains)
+		return nfserr_inval;
+	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
+}
+
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *iomaps;
+	int nr_iomaps;
+
+	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+	if (nr_iomaps < 0)
+		return nfserrno(nr_iomaps);
+
+	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
+}
+
 const struct nfsd4_layout_ops bl_layout_ops = {
 	/*
 	 * Pretend that we send notification to the client.  This is a blatant
@@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
 	.encode_layoutget	= nfsd4_block_encode_layoutget,
 	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
 };
+#endif /* CONFIG_NFSD_BLOCKLAYOUT */
+
+#ifdef CONFIG_NFSD_SCSILAYOUT
+static int nfsd4_scsi_identify_device(struct block_device *bdev,
+		struct pnfs_block_volume *b)
+{
+	struct request_queue *q = bdev->bd_disk->queue;
+	struct request *rq;
+	size_t bufflen = 252, len, id_len;
+	u8 *buf, *d, type, assoc;
+	int error;
+
+	buf = kzalloc(bufflen, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rq = blk_get_request(q, READ, GFP_KERNEL);
+	if (IS_ERR(rq)) {
+		error = -ENOMEM;
+		goto out_free_buf;
+	}
+	blk_rq_set_block_pc(rq);
+
+	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
+	if (error)
+		goto out_put_request;
+
+	rq->cmd[0] = INQUIRY;
+	rq->cmd[1] = 1;
+	rq->cmd[2] = 0x83;
+	rq->cmd[3] = bufflen >> 8;
+	rq->cmd[4] = bufflen & 0xff;
+	rq->cmd_len = COMMAND_SIZE(INQUIRY);
+
+	error = blk_execute_rq(rq->q, NULL, rq, 1);
+	if (error) {
+		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
+			rq->errors);
+		goto out_put_request;
+	}
+
+	len = (buf[2] << 8) + buf[3] + 4;
+	if (len > bufflen) {
+		pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
+			len);
+		goto out_put_request;
+	}
+
+	d = buf + 4;
+	for (d = buf + 4; d < buf + len; d += id_len + 4) {
+		id_len = d[3];
+		type = d[1] & 0xf;
+		assoc = (d[1] >> 4) & 0x3;
+
+		/*
+		 * We only care about a EUI-64 and NAA designator types
+		 * with LU association.
+		 */
+		if (assoc != 0x00)
+			continue;
+		if (type != 0x02 && type != 0x03)
+			continue;
+		if (id_len != 8 && id_len != 12 && id_len != 16)
+			continue;
+
+		b->scsi.code_set = PS_CODE_SET_BINARY;
+		b->scsi.designator_type = type == 0x02 ?
+			PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
+		b->scsi.designator_len = id_len;
+		memcpy(b->scsi.designator, d + 4, id_len);
+
+		/*
+		 * If we found a 8 or 12 byte descriptor continue on to
+		 * see if a 16 byte one is available.  If we find a
+		 * 16 byte descriptor we're done.
+		 */
+		if (id_len == 16)
+			break;
+	}
+
+out_put_request:
+	blk_put_request(rq);
+out_free_buf:
+	kfree(buf);
+	return error;
+}
+
+#define NFSD_MDS_PR_KEY		0x0100000000000000
+
+/*
+ * We use the client ID as a unique key for the reservations.
+ * This allows us to easily fence a client when recalls fail.
+ */
+static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
+{
+	return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
+}
+
+static int
+nfsd4_block_get_device_info_scsi(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev;
+	struct pnfs_block_volume *b;
+	const struct pr_ops *ops;
+	int error;
+
+	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	gdp->gd_device = dev;
+
+	dev->nr_volumes = 1;
+	b = &dev->volumes[0];
+
+	b->type = PNFS_BLOCK_VOLUME_SCSI;
+	b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
+
+	error = nfsd4_scsi_identify_device(sb->s_bdev, b);
+	if (error)
+		return error;
+
+	ops = sb->s_bdev->bd_disk->fops->pr_ops;
+	if (!ops) {
+		pr_err("pNFS: device %s does not support PRs.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+	if (error) {
+		pr_err("pNFS: failed to register key for device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+			PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
+	if (error) {
+		pr_err("pNFS: failed to reserve device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static __be32
+nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	if (sb->s_bdev != sb->s_bdev->bd_contains)
+		return nfserr_inval;
+	return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
+}
+static __be32
+nfsd4_scsi_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *iomaps;
+	int nr_iomaps;
+
+	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+	if (nr_iomaps < 0)
+		return nfserrno(nr_iomaps);
+
+	return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
+}
+
+static void
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
+
+	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
+			nfsd4_scsi_pr_key(clp), 0, true);
+}
+
+const struct nfsd4_layout_ops scsi_layout_ops = {
+	/*
+	 * Pretend that we send notification to the client.  This is a blatant
+	 * lie to force recent Linux clients to cache our device IDs.
+	 * We rarely ever change the device ID, so the harm of leaking deviceids
+	 * for a while isn't too bad.  Unfortunately RFC5661 is a complete mess
+	 * in this regard, but I filed errata 4119 for this a while ago, and
+	 * hopefully the Linux client will eventually start caching deviceids
+	 * without this again.
+	 */
+	.notify_types		=
+			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+	.proc_getdeviceinfo	= nfsd4_scsi_proc_getdeviceinfo,
+	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
+	.proc_layoutget		= nfsd4_block_proc_layoutget,
+	.encode_layoutget	= nfsd4_block_encode_layoutget,
+	.proc_layoutcommit	= nfsd4_scsi_proc_layoutcommit,
+	.fence_client		= nfsd4_scsi_fence_client,
+};
+#endif /* CONFIG_NFSD_SCSILAYOUT */
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6d834dc9bbc8..6c3b316f932e 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 		p = xdr_encode_hyper(p, b->simple.offset);
 		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
 		break;
+	case PNFS_BLOCK_VOLUME_SCSI:
+		len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
+		p = xdr_reserve_space(xdr, len);
+		if (!p)
+			return -ETOOSMALL;
+
+		*p++ = cpu_to_be32(b->type);
+		*p++ = cpu_to_be32(b->scsi.code_set);
+		*p++ = cpu_to_be32(b->scsi.designator_type);
+		p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
+		p = xdr_encode_hyper(p, b->scsi.pr_key);
+		break;
 	default:
 		return -ENOTSUPP;
 	}
@@ -93,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 		u32 block_size)
 {
 	struct iomap *iomaps;
-	u32 nr_iomaps, expected, i;
+	u32 nr_iomaps, i;
 
 	if (len < sizeof(u32)) {
 		dprintk("%s: extent array too small: %u\n", __func__, len);
 		return -EINVAL;
 	}
+	len -= sizeof(u32);
+	if (len % PNFS_BLOCK_EXTENT_SIZE) {
+		dprintk("%s: extent array invalid: %u\n", __func__, len);
+		return -EINVAL;
+	}
 
 	nr_iomaps = be32_to_cpup(p++);
-	expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
-	if (len != expected) {
+	if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
 		dprintk("%s: extent array size mismatch: %u/%u\n",
-			__func__, len, expected);
+			__func__, len, nr_iomaps);
 		return -EINVAL;
 	}
 
@@ -155,3 +171,54 @@ fail:
 	kfree(iomaps);
 	return -EINVAL;
 }
+
+int
+nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, expected, i;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	nr_iomaps = be32_to_cpup(p++);
+	expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
+	if (len != expected) {
+		dprintk("%s: extent array size mismatch: %u/%u\n",
+			__func__, len, expected);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		u64 val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].offset = val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].length = val;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 6de925fe8499..397bc7563a49 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -15,6 +15,11 @@ struct pnfs_block_extent {
 	enum pnfs_block_extent_state	es;
 };
 
+struct pnfs_block_range {
+	u64				foff;
+	u64				len;
+};
+
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
  * Not actually limited by the protocol.
@@ -29,6 +34,13 @@ struct pnfs_block_volume {
 			u32		sig_len;
 			u8		sig[PNFS_BLOCK_UUID_LEN];
 		} simple;
+		struct {
+			enum scsi_code_set		code_set;
+			enum scsi_designator_type	designator_type;
+			int				designator_len;
+			u8				designator[256];
+			u64				pr_key;
+		} scsi;
 	};
 };
 
@@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
 		struct nfsd4_layoutget *lgp);
 int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 		u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size);
 
 #endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7b755b7f785c..51c3b06e8036 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -147,6 +147,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 {
 	__be32	nfserr;
 	u32	max_blocksize = svc_max_payload(rqstp);
+	unsigned long cnt = min(argp->count, max_blocksize);
 
 	dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
 				SVCFH_fmt(&argp->fh),
@@ -157,7 +158,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
 	 * + 1 (xdr opaque byte count) = 26
 	 */
-	resp->count = min(argp->count, max_blocksize);
+	resp->count = cnt;
 	svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
 
 	fh_copy(&resp->fh, &argp->fh);
@@ -167,8 +168,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 				  &resp->count);
 	if (nfserr == 0) {
 		struct inode	*inode = d_inode(resp->fh.fh_dentry);
-
-		resp->eof = (argp->offset + resp->count) >= inode->i_size;
+		resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
+							inode->i_size);
 	}
 
 	RETURN_STATUS(nfserr);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ce2d010d3b17..825c7bc8d789 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2014 Christoph Hellwig.
  */
+#include <linux/blkdev.h>
 #include <linux/kmod.h>
 #include <linux/file.h>
 #include <linux/jhash.h>
@@ -26,7 +27,12 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
 static const struct lock_manager_operations nfsd4_layouts_lm_ops;
 
 const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+	[LAYOUT_SCSI]		= &scsi_layout_ops,
+#endif
 };
 
 /* pNFS device ID to export fsid mapping */
@@ -121,10 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 	if (!(exp->ex_flags & NFSEXP_PNFS))
 		return;
 
+	/*
+	 * Check if the file system supports exporting a block-like layout.
+	 * If the block device supports reservations prefer the SCSI layout,
+	 * otherwise advertise the block layout.
+	 */
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 	if (sb->s_export_op->get_uuid &&
 	    sb->s_export_op->map_blocks &&
 	    sb->s_export_op->commit_blocks)
 		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+	/* overwrite block layout selection if needed */
+	if (sb->s_export_op->map_blocks &&
+	    sb->s_export_op->commit_blocks &&
+	    sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
+		exp->ex_layout_type = LAYOUT_SCSI;
+#endif
 }
 
 static void
@@ -590,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 
 	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
 
-	trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
-
 	printk(KERN_WARNING
 		"nfsd: client %s failed to respond to layout recall. "
 		"  Fencing..\n", addr_str);
@@ -626,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
 	struct nfsd_net *nn;
 	ktime_t now, cutoff;
+	const struct nfsd4_layout_ops *ops;
 	LIST_HEAD(reaplist);
 
 
@@ -661,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 		/*
 		 * Unknown error or non-responding client, we'll need to fence.
 		 */
-		nfsd4_cb_layout_fail(ls);
+		trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+
+		ops = nfsd4_layout_ops[ls->ls_layout_type];
+		if (ops->fence_client)
+			ops->fence_client(ls);
+		else
+			nfsd4_cb_layout_fail(ls);
 		return -1;
 	}
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4cba7865f496..de1ff1d98bb1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -864,12 +864,10 @@ static __be32
 nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	      struct nfsd4_secinfo *secinfo)
 {
-	struct svc_fh resfh;
 	struct svc_export *exp;
 	struct dentry *dentry;
 	__be32 err;
 
-	fh_init(&resfh, NFS4_FHSIZE);
 	err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
 	if (err)
 		return err;
@@ -878,6 +876,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				    &exp, &dentry);
 	if (err)
 		return err;
+	fh_unlock(&cstate->current_fh);
 	if (d_really_is_negative(dentry)) {
 		exp_put(exp);
 		err = nfserr_noent;
@@ -1269,8 +1268,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
 		goto out;
 
 	nfserr = nfs_ok;
-	if (gdp->gd_maxcount != 0)
-		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+	if (gdp->gd_maxcount != 0) {
+		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
+					cstate->session->se_client, gdp);
+	}
 
 	gdp->gd_notify_types &= ops->notify_types;
 out:
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 195fe2668207..66eaeb1e8c2c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1266,6 +1266,7 @@ nfsd4_umh_cltrack_init(struct net *net)
 	/* XXX: The usermode helper s not working in container yet. */
 	if (net != &init_net) {
 		pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
+		kfree(grace_start);
 		return -EINVAL;
 	}
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c484a2b6cd10..0462eeddfff9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2408,7 +2408,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	default:				/* checked by xdr code */
 		WARN_ON_ONCE(1);
 	case SP4_SSV:
-		return nfserr_encr_alg_unsupp;
+		status = nfserr_encr_alg_unsupp;
+		goto out_nolock;
 	}
 
 	/* Cases below refer to rfc 5661 section 18.35.4: */
@@ -2586,21 +2587,26 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
 	return nfs_ok;
 }
 
+/*
+ * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now.
+ * These are based on similar macros in linux/sunrpc/msg_prot.h .
+ */
+#define RPC_MAX_HEADER_WITH_AUTH_SYS \
+	(RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK))
+
+#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \
+	(RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK))
+
 #define NFSD_CB_MAX_REQ_SZ	((NFS4_enc_cb_recall_sz + \
-				 RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32))
+				 RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32))
 #define NFSD_CB_MAX_RESP_SZ	((NFS4_dec_cb_recall_sz + \
-				 RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32))
+				 RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \
+				 sizeof(__be32))
 
 static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
 {
 	ca->headerpadsz = 0;
 
-	/*
-	 * These RPC_MAX_HEADER macros are overkill, especially since we
-	 * don't even do gss on the backchannel yet.  But this is still
-	 * less than 1k.  Tighten up this estimate in the unlikely event
-	 * it turns out to be a problem for some client:
-	 */
 	if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
 		return nfserr_toosmall;
 	if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
@@ -2710,10 +2716,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out_free_conn;
 	}
 	status = nfs_ok;
-	/*
-	 * We do not support RDMA or persistent sessions
-	 */
+	/* Persistent sessions are not supported */
 	cr_ses->flags &= ~SESSION4_PERSIST;
+	/* Upshifting from TCP to RDMA is not supported */
 	cr_ses->flags &= ~SESSION4_RDMA;
 
 	init_session(rqstp, new, conf, cr_ses);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d6ef0955a979..9df898ba648f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
 
 	READ_BUF(4);
 	rename->rn_snamelen = be32_to_cpup(p++);
-	READ_BUF(rename->rn_snamelen + 4);
+	READ_BUF(rename->rn_snamelen);
 	SAVEMEM(rename->rn_sname, rename->rn_snamelen);
+	READ_BUF(4);
 	rename->rn_tnamelen = be32_to_cpup(p++);
 	READ_BUF(rename->rn_tnamelen);
 	SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
@@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
 	READ_BUF(8);
 	setclientid->se_callback_prog = be32_to_cpup(p++);
 	setclientid->se_callback_netid_len = be32_to_cpup(p++);
-
-	READ_BUF(setclientid->se_callback_netid_len + 4);
+	READ_BUF(setclientid->se_callback_netid_len);
 	SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
+	READ_BUF(4);
 	setclientid->se_callback_addr_len = be32_to_cpup(p++);
 
-	READ_BUF(setclientid->se_callback_addr_len + 4);
+	READ_BUF(setclientid->se_callback_addr_len);
 	SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
+	READ_BUF(4);
 	setclientid->se_callback_ident = be32_to_cpup(p++);
 
 	DECODE_TAIL;
@@ -1835,8 +1837,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 
 	READ_BUF(4);
 	argp->taglen = be32_to_cpup(p++);
-	READ_BUF(argp->taglen + 8);
+	READ_BUF(argp->taglen);
 	SAVEMEM(argp->tag, argp->taglen);
+	READ_BUF(8);
 	argp->minorversion = be32_to_cpup(p++);
 	argp->opcnt = be32_to_cpup(p++);
 	max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
@@ -3060,7 +3063,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
 		p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
 						NFS4_MAX_SESSIONID_LEN);
 		*p++ = cpu_to_be32(bcts->dir);
-		/* Sorry, we do not yet support RDMA over 4.1: */
+		/* Upshifting from TCP to RDMA is not supported */
 		*p++ = cpu_to_be32(0);
 	}
 	return nfserr;
@@ -3362,6 +3365,7 @@ static __be32 nfsd4_encode_splice_read(
 	struct xdr_stream *xdr = &resp->xdr;
 	struct xdr_buf *buf = xdr->buf;
 	u32 eof;
+	long len;
 	int space_left;
 	__be32 nfserr;
 	__be32 *p = xdr->p - 2;
@@ -3370,6 +3374,7 @@ static __be32 nfsd4_encode_splice_read(
 	if (xdr->end - xdr->p < 1)
 		return nfserr_resource;
 
+	len = maxcount;
 	nfserr = nfsd_splice_read(read->rd_rqstp, file,
 				  read->rd_offset, &maxcount);
 	if (nfserr) {
@@ -3382,8 +3387,8 @@ static __be32 nfsd4_encode_splice_read(
 		return nfserr;
 	}
 
-	eof = (read->rd_offset + maxcount >=
-	       d_inode(read->rd_fhp->fh_dentry)->i_size);
+	eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
+				d_inode(read->rd_fhp->fh_dentry)->i_size);
 
 	*(p++) = htonl(eof);
 	*(p++) = htonl(maxcount);
@@ -3453,14 +3458,15 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
 	}
 	read->rd_vlen = v;
 
+	len = maxcount;
 	nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
 			read->rd_vlen, &maxcount);
 	if (nfserr)
 		return nfserr;
 	xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
 
-	eof = (read->rd_offset + maxcount >=
-	       d_inode(read->rd_fhp->fh_dentry)->i_size);
+	eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
+				d_inode(read->rd_fhp->fh_dentry)->i_size);
 
 	tmp = htonl(eof);
 	write_bytes_to_xdr_buf(xdr->buf, starting_len    , &tmp, 4);
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index d4c4453674c6..7d073b9b1553 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
 	u32		notify_types;
 
 	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+			struct nfs4_client *clp,
 			struct nfsd4_getdeviceinfo *gdevp);
 	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
 			struct nfsd4_getdeviceinfo *gdevp);
@@ -32,10 +33,17 @@ struct nfsd4_layout_ops {
 
 	__be32 (*proc_layoutcommit)(struct inode *inode,
 			struct nfsd4_layoutcommit *lcp);
+
+	void (*fence_client)(struct nfs4_layout_stateid *ls);
 };
 
 extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 extern const struct nfsd4_layout_ops bl_layout_ops;
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+extern const struct nfsd4_layout_ops scsi_layout_ops;
+#endif
 
 __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c11ba316f23f..2d573ec057f8 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -139,4 +139,23 @@ static inline int nfsd_create_is_exclusive(int createmode)
 	       || createmode == NFS4_CREATE_EXCLUSIVE4_1;
 }
 
+static inline bool nfsd_eof_on_read(long requested, long read,
+				loff_t offset, loff_t size)
+{
+	/* We assume a short read means eof: */
+	if (requested > read)
+		return true;
+	/*
+	 * A non-short read might also reach end of file.  The spec
+	 * still requires us to set eof in that case.
+	 *
+	 * Further operations may have modified the file size since
+	 * the read, so the following check is not atomic with the read.
+	 * We've only seen that cause a problem for a client in the case
+	 * where the read returned a count of 0 without setting eof.
+	 * That case was fixed by the addition of the above check.
+	 */
+	return (offset + read >= size);
+}
+
 #endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
 	quota_local.o		\
 	quota_global.o		\
 	xattr.o			\
-	acl.o
+	acl.o	\
+	filecheck.o
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..2cabbcf2f28e
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,606 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.c
+ *
+ * Code which implements online file check.
+ *
+ * Copyright (C) 2016 SuSE.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#include "inode.h"
+
+#include "filecheck.h"
+
+
+/* File check error strings,
+ * must correspond with error number in header file.
+ */
+static const char * const ocfs2_filecheck_errs[] = {
+	"SUCCESS",
+	"FAILED",
+	"INPROGRESS",
+	"READONLY",
+	"INJBD",
+	"INVALIDINO",
+	"BLOCKECC",
+	"BLOCKNO",
+	"VALIDFLAG",
+	"GENERATION",
+	"UNSUPPORTED"
+};
+
+static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
+static LIST_HEAD(ocfs2_filecheck_sysfs_list);
+
+struct ocfs2_filecheck {
+	struct list_head fc_head;	/* File check entry list head */
+	spinlock_t fc_lock;
+	unsigned int fc_max;	/* Maximum number of entry in list */
+	unsigned int fc_size;	/* Current entry count in list */
+	unsigned int fc_done;	/* Finished entry count in list */
+};
+
+struct ocfs2_filecheck_sysfs_entry {	/* sysfs entry per mounting */
+	struct list_head fs_list;
+	atomic_t fs_count;
+	struct super_block *fs_sb;
+	struct kset *fs_devicekset;
+	struct kset *fs_fcheckkset;
+	struct ocfs2_filecheck *fs_fcheck;
+};
+
+#define OCFS2_FILECHECK_MAXSIZE		100
+#define OCFS2_FILECHECK_MINSIZE		10
+
+/* File check operation type */
+enum {
+	OCFS2_FILECHECK_TYPE_CHK = 0,	/* Check a file(inode) */
+	OCFS2_FILECHECK_TYPE_FIX,	/* Fix a file(inode) */
+	OCFS2_FILECHECK_TYPE_SET = 100	/* Set entry list maximum size */
+};
+
+struct ocfs2_filecheck_entry {
+	struct list_head fe_list;
+	unsigned long fe_ino;
+	unsigned int fe_type;
+	unsigned int fe_done:1;
+	unsigned int fe_status:31;
+};
+
+struct ocfs2_filecheck_args {
+	unsigned int fa_type;
+	union {
+		unsigned long fa_ino;
+		unsigned int fa_len;
+	};
+};
+
+static const char *
+ocfs2_filecheck_error(int errno)
+{
+	if (!errno)
+		return ocfs2_filecheck_errs[errno];
+
+	BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
+	       errno > OCFS2_FILECHECK_ERR_END);
+	return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf);
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count);
+static struct kobj_attribute ocfs2_attr_filecheck_chk =
+					__ATTR(check, S_IRUSR | S_IWUSR,
+					ocfs2_filecheck_show,
+					ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_fix =
+					__ATTR(fix, S_IRUSR | S_IWUSR,
+					ocfs2_filecheck_show,
+					ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_set =
+					__ATTR(set, S_IRUSR | S_IWUSR,
+					ocfs2_filecheck_show,
+					ocfs2_filecheck_store);
+
+static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
+{
+	schedule();
+	return 0;
+}
+
+static void
+ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+	struct ocfs2_filecheck_entry *p;
+
+	if (!atomic_dec_and_test(&entry->fs_count))
+		wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+				 TASK_UNINTERRUPTIBLE);
+
+	spin_lock(&entry->fs_fcheck->fc_lock);
+	while (!list_empty(&entry->fs_fcheck->fc_head)) {
+		p = list_first_entry(&entry->fs_fcheck->fc_head,
+				     struct ocfs2_filecheck_entry, fe_list);
+		list_del(&p->fe_list);
+		BUG_ON(!p->fe_done); /* To free a undone file check entry */
+		kfree(p);
+	}
+	spin_unlock(&entry->fs_fcheck->fc_lock);
+
+	kset_unregister(entry->fs_fcheckkset);
+	kset_unregister(entry->fs_devicekset);
+	kfree(entry->fs_fcheck);
+	kfree(entry);
+}
+
+static void
+ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+	spin_lock(&ocfs2_filecheck_sysfs_lock);
+	list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
+	spin_unlock(&ocfs2_filecheck_sysfs_lock);
+}
+
+static int ocfs2_filecheck_sysfs_del(const char *devname)
+{
+	struct ocfs2_filecheck_sysfs_entry *p;
+
+	spin_lock(&ocfs2_filecheck_sysfs_lock);
+	list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+		if (!strcmp(p->fs_sb->s_id, devname)) {
+			list_del(&p->fs_list);
+			spin_unlock(&ocfs2_filecheck_sysfs_lock);
+			ocfs2_filecheck_sysfs_free(p);
+			return 0;
+		}
+	}
+	spin_unlock(&ocfs2_filecheck_sysfs_lock);
+	return 1;
+}
+
+static void
+ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+	if (atomic_dec_and_test(&entry->fs_count))
+		wake_up_atomic_t(&entry->fs_count);
+}
+
+static struct ocfs2_filecheck_sysfs_entry *
+ocfs2_filecheck_sysfs_get(const char *devname)
+{
+	struct ocfs2_filecheck_sysfs_entry *p = NULL;
+
+	spin_lock(&ocfs2_filecheck_sysfs_lock);
+	list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+		if (!strcmp(p->fs_sb->s_id, devname)) {
+			atomic_inc(&p->fs_count);
+			spin_unlock(&ocfs2_filecheck_sysfs_lock);
+			return p;
+		}
+	}
+	spin_unlock(&ocfs2_filecheck_sysfs_lock);
+	return NULL;
+}
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb)
+{
+	int ret = 0;
+	struct kset *device_kset = NULL;
+	struct kset *fcheck_kset = NULL;
+	struct ocfs2_filecheck *fcheck = NULL;
+	struct ocfs2_filecheck_sysfs_entry *entry = NULL;
+	struct attribute **attrs = NULL;
+	struct attribute_group attrgp;
+
+	if (!ocfs2_kset)
+		return -ENOMEM;
+
+	attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
+	if (!attrs) {
+		ret = -ENOMEM;
+		goto error;
+	} else {
+		attrs[0] = &ocfs2_attr_filecheck_chk.attr;
+		attrs[1] = &ocfs2_attr_filecheck_fix.attr;
+		attrs[2] = &ocfs2_attr_filecheck_set.attr;
+		attrs[3] = NULL;
+		memset(&attrgp, 0, sizeof(attrgp));
+		attrgp.attrs = attrs;
+	}
+
+	fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
+	if (!fcheck) {
+		ret = -ENOMEM;
+		goto error;
+	} else {
+		INIT_LIST_HEAD(&fcheck->fc_head);
+		spin_lock_init(&fcheck->fc_lock);
+		fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+		fcheck->fc_size = 0;
+		fcheck->fc_done = 0;
+	}
+
+	if (strlen(sb->s_id) <= 0) {
+		mlog(ML_ERROR,
+		"Cannot get device basename when create filecheck sysfs\n");
+		ret = -ENODEV;
+		goto error;
+	}
+
+	device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
+	if (!device_kset) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	fcheck_kset = kset_create_and_add("filecheck", NULL,
+					  &device_kset->kobj);
+	if (!fcheck_kset) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
+	if (ret)
+		goto error;
+
+	entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto error;
+	} else {
+		atomic_set(&entry->fs_count, 1);
+		entry->fs_sb = sb;
+		entry->fs_devicekset = device_kset;
+		entry->fs_fcheckkset = fcheck_kset;
+		entry->fs_fcheck = fcheck;
+		ocfs2_filecheck_sysfs_add(entry);
+	}
+
+	kfree(attrs);
+	return 0;
+
+error:
+	kfree(attrs);
+	kfree(entry);
+	kfree(fcheck);
+	kset_unregister(fcheck_kset);
+	kset_unregister(device_kset);
+	return ret;
+}
+
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+{
+	return ocfs2_filecheck_sysfs_del(sb->s_id);
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+			      unsigned int count);
+static int
+ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
+			   unsigned int len)
+{
+	int ret;
+
+	if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
+		return -EINVAL;
+
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
+		mlog(ML_ERROR,
+		"Cannot set online file check maximum entry number "
+		"to %u due to too many pending entries(%u)\n",
+		len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
+		ret = -EBUSY;
+	} else {
+		if (len < ent->fs_fcheck->fc_size)
+			BUG_ON(!ocfs2_filecheck_erase_entries(ent,
+				ent->fs_fcheck->fc_size - len));
+
+		ent->fs_fcheck->fc_max = len;
+		ret = 0;
+	}
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+
+	return ret;
+}
+
+#define OCFS2_FILECHECK_ARGS_LEN	24
+static int
+ocfs2_filecheck_args_get_long(const char *buf, size_t count,
+			      unsigned long *val)
+{
+	char buffer[OCFS2_FILECHECK_ARGS_LEN];
+
+	memcpy(buffer, buf, count);
+	buffer[count] = '\0';
+
+	if (kstrtoul(buffer, 0, val))
+		return 1;
+
+	return 0;
+}
+
+static int
+ocfs2_filecheck_type_parse(const char *name, unsigned int *type)
+{
+	if (!strncmp(name, "fix", 4))
+		*type = OCFS2_FILECHECK_TYPE_FIX;
+	else if (!strncmp(name, "check", 6))
+		*type = OCFS2_FILECHECK_TYPE_CHK;
+	else if (!strncmp(name, "set", 4))
+		*type = OCFS2_FILECHECK_TYPE_SET;
+	else
+		return 1;
+
+	return 0;
+}
+
+static int
+ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
+			   struct ocfs2_filecheck_args *args)
+{
+	unsigned long val = 0;
+	unsigned int type;
+
+	/* too short/long args length */
+	if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN))
+		return 1;
+
+	if (ocfs2_filecheck_type_parse(name, &type))
+		return 1;
+	if (ocfs2_filecheck_args_get_long(buf, count, &val))
+		return 1;
+
+	if (val <= 0)
+		return 1;
+
+	args->fa_type = type;
+	if (type == OCFS2_FILECHECK_TYPE_SET)
+		args->fa_len = (unsigned int)val;
+	else
+		args->fa_ino = val;
+
+	return 0;
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+
+	ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+	unsigned int type;
+	struct ocfs2_filecheck_entry *p;
+	struct ocfs2_filecheck_sysfs_entry *ent;
+
+	if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
+		return -EINVAL;
+
+	ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+	if (!ent) {
+		mlog(ML_ERROR,
+		"Cannot get the corresponding entry via device basename %s\n",
+		kobj->name);
+		return -ENODEV;
+	}
+
+	if (type == OCFS2_FILECHECK_TYPE_SET) {
+		spin_lock(&ent->fs_fcheck->fc_lock);
+		total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
+		spin_unlock(&ent->fs_fcheck->fc_lock);
+		goto exit;
+	}
+
+	ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n");
+	total += ret;
+	remain -= ret;
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+		if (p->fe_type != type)
+			continue;
+
+		ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
+			       p->fe_ino, p->fe_done,
+			       ocfs2_filecheck_error(p->fe_status));
+		if (ret < 0) {
+			total = ret;
+			break;
+		}
+		if (ret == remain) {
+			/* snprintf() didn't fit */
+			total = -E2BIG;
+			break;
+		}
+		total += ret;
+		remain -= ret;
+	}
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+
+exit:
+	ocfs2_filecheck_sysfs_put(ent);
+	return total;
+}
+
+static int
+ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
+{
+	struct ocfs2_filecheck_entry *p;
+
+	list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+		if (p->fe_done) {
+			list_del(&p->fe_list);
+			kfree(p);
+			ent->fs_fcheck->fc_size--;
+			ent->fs_fcheck->fc_done--;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+			      unsigned int count)
+{
+	unsigned int i = 0;
+	unsigned int ret = 0;
+
+	while (i++ < count) {
+		if (ocfs2_filecheck_erase_entry(ent))
+			ret++;
+		else
+			break;
+	}
+
+	return (ret == count ? 1 : 0);
+}
+
+static void
+ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+			   struct ocfs2_filecheck_entry *entry)
+{
+	entry->fe_done = 1;
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	ent->fs_fcheck->fc_done++;
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+}
+
+static unsigned int
+ocfs2_filecheck_handle(struct super_block *sb,
+		       unsigned long ino, unsigned int flags)
+{
+	unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
+	struct inode *inode = NULL;
+	int rc;
+
+	inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+	if (IS_ERR(inode)) {
+		rc = (int)(-(long)inode);
+		if (rc >= OCFS2_FILECHECK_ERR_START &&
+		    rc < OCFS2_FILECHECK_ERR_END)
+			ret = rc;
+		else
+			ret = OCFS2_FILECHECK_ERR_FAILED;
+	} else
+		iput(inode);
+
+	return ret;
+}
+
+static void
+ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+			     struct ocfs2_filecheck_entry *entry)
+{
+	if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
+		entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+				entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
+	else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
+		entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+				entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
+	else
+		entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
+
+	ocfs2_filecheck_done_entry(ent, entry);
+}
+
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct ocfs2_filecheck_args args;
+	struct ocfs2_filecheck_entry *entry;
+	struct ocfs2_filecheck_sysfs_entry *ent;
+	ssize_t ret = 0;
+
+	if (count == 0)
+		return count;
+
+	if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
+		mlog(ML_ERROR, "Invalid arguments for online file check\n");
+		return -EINVAL;
+	}
+
+	ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+	if (!ent) {
+		mlog(ML_ERROR,
+		"Cannot get the corresponding entry via device basename %s\n",
+		kobj->parent->name);
+		return -ENODEV;
+	}
+
+	if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
+		ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
+		goto exit;
+	}
+
+	entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+	    (ent->fs_fcheck->fc_done == 0)) {
+		mlog(ML_ERROR,
+		"Cannot do more file check "
+		"since file check queue(%u) is full now\n",
+		ent->fs_fcheck->fc_max);
+		ret = -EBUSY;
+		kfree(entry);
+	} else {
+		if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+		    (ent->fs_fcheck->fc_done > 0)) {
+			/* Delete the oldest entry which was done,
+			 * make sure the entry size in list does
+			 * not exceed maximum value
+			 */
+			BUG_ON(!ocfs2_filecheck_erase_entry(ent));
+		}
+
+		entry->fe_ino = args.fa_ino;
+		entry->fe_type = args.fa_type;
+		entry->fe_done = 0;
+		entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
+		list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
+		ent->fs_fcheck->fc_size++;
+	}
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+
+	if (!ret)
+		ocfs2_filecheck_handle_entry(ent, entry);
+
+exit:
+	ocfs2_filecheck_sysfs_put(ent);
+	return (!ret ? count : ret);
+}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..e5cd002a2c09
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,49 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.h
+ *
+ * Online file check.
+ *
+ * Copyright (C) 2016 SuSE.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef FILECHECK_H
+#define FILECHECK_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+
+/* File check errno */
+enum {
+	OCFS2_FILECHECK_ERR_SUCCESS = 0,	/* Success */
+	OCFS2_FILECHECK_ERR_FAILED = 1000,	/* Other failure */
+	OCFS2_FILECHECK_ERR_INPROGRESS,		/* In progress */
+	OCFS2_FILECHECK_ERR_READONLY,		/* Read only */
+	OCFS2_FILECHECK_ERR_INJBD,		/* Buffer in jbd */
+	OCFS2_FILECHECK_ERR_INVALIDINO,		/* Invalid ino */
+	OCFS2_FILECHECK_ERR_BLOCKECC,		/* Block ecc */
+	OCFS2_FILECHECK_ERR_BLOCKNO,		/* Block number */
+	OCFS2_FILECHECK_ERR_VALIDFLAG,		/* Inode valid flag */
+	OCFS2_FILECHECK_ERR_GENERATION,		/* Inode generation */
+	OCFS2_FILECHECK_ERR_UNSUPPORTED		/* Unsupported */
+};
+
+#define OCFS2_FILECHECK_ERR_START	OCFS2_FILECHECK_ERR_FAILED
+#define OCFS2_FILECHECK_ERR_END		OCFS2_FILECHECK_ERR_UNSUPPORTED
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb);
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+
+#endif  /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 36294446d960..ba495beff1c2 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
 #include "xattr.h"
 #include "refcounttree.h"
 #include "ocfs2_trace.h"
+#include "filecheck.h"
 
 #include "buffer_head_io.h"
 
@@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 				    struct inode *inode,
 				    struct buffer_head *fe_bh);
 
+static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+						 struct buffer_head **bh,
+						 int flags, int type);
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+						struct buffer_head *bh);
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+					      struct buffer_head *bh);
+
 void ocfs2_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 			 int sysfile_type)
 {
+	int rc = 0;
 	struct inode *inode = NULL;
 	struct super_block *sb = osb->sb;
 	struct ocfs2_find_inode_args args;
@@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	}
 	trace_ocfs2_iget5_locked(inode->i_state);
 	if (inode->i_state & I_NEW) {
-		ocfs2_read_locked_inode(inode, &args);
+		rc = ocfs2_read_locked_inode(inode, &args);
 		unlock_new_inode(inode);
 	}
 	if (is_bad_inode(inode)) {
 		iput(inode);
-		inode = ERR_PTR(-ESTALE);
+		if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
+		    (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
+			/* Return OCFS2_FILECHECK_ERR_XXX related errno */
+			inode = ERR_PTR(rc);
+		else
+			inode = ERR_PTR(-ESTALE);
 		goto bail;
 	}
 
@@ -410,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *fe;
 	struct buffer_head *bh = NULL;
-	int status, can_lock;
+	int status, can_lock, lock_level = 0;
 	u32 generation = 0;
 
 	status = -EINVAL;
@@ -478,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 			mlog_errno(status);
 			return status;
 		}
-		status = ocfs2_inode_lock(inode, NULL, 0);
+		status = ocfs2_inode_lock(inode, NULL, lock_level);
 		if (status) {
 			make_bad_inode(inode);
 			mlog_errno(status);
@@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	}
 
 	if (can_lock) {
-		status = ocfs2_read_inode_block_full(inode, &bh,
-						     OCFS2_BH_IGNORE_CACHE);
+		if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+			status = ocfs2_filecheck_read_inode_block_full(inode,
+						&bh, OCFS2_BH_IGNORE_CACHE, 0);
+		else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+			status = ocfs2_filecheck_read_inode_block_full(inode,
+						&bh, OCFS2_BH_IGNORE_CACHE, 1);
+		else
+			status = ocfs2_read_inode_block_full(inode,
+						&bh, OCFS2_BH_IGNORE_CACHE);
 	} else {
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 		/*
 		 * If buffer is in jbd, then its checksum may not have been
 		 * computed as yet.
 		 */
-		if (!status && !buffer_jbd(bh))
-			status = ocfs2_validate_inode_block(osb->sb, bh);
+		if (!status && !buffer_jbd(bh)) {
+			if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+				status = ocfs2_filecheck_validate_inode_block(
+								osb->sb, bh);
+			else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+				status = ocfs2_filecheck_repair_inode_block(
+								osb->sb, bh);
+			else
+				status = ocfs2_validate_inode_block(
+								osb->sb, bh);
+		}
 	}
 	if (status < 0) {
 		mlog_errno(status);
@@ -532,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
 
+	if (buffer_dirty(bh) && !buffer_jbd(bh)) {
+		if (can_lock) {
+			ocfs2_inode_unlock(inode, lock_level);
+			lock_level = 1;
+			ocfs2_inode_lock(inode, NULL, lock_level);
+		}
+		status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	status = 0;
 
 bail:
 	if (can_lock)
-		ocfs2_inode_unlock(inode, 0);
+		ocfs2_inode_unlock(inode, lock_level);
 
 	if (status < 0)
 		make_bad_inode(inode);
@@ -1397,6 +1441,169 @@ bail:
 	return rc;
 }
 
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+						struct buffer_head *bh)
+{
+	int rc = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	trace_ocfs2_filecheck_validate_inode_block(
+		(unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * Call ocfs2_validate_meta_ecc() first since it has ecc repair
+	 * function, but we should not return error immediately when ecc
+	 * validation fails, because the reason is quite likely the invalid
+	 * inode number inputed.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+	if (rc) {
+		mlog(ML_ERROR,
+		     "Filecheck: checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
+	}
+
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: signature = %.*s\n",
+		     (unsigned long long)bh->b_blocknr, 7, di->i_signature);
+		rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
+		goto bail;
+	} else if (rc)
+		goto bail;
+
+	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
+		     (unsigned long long)bh->b_blocknr,
+		     (unsigned long long)le64_to_cpu(di->i_blkno));
+		rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
+		goto bail;
+	}
+
+	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
+		     "not set\n",
+		     (unsigned long long)bh->b_blocknr);
+		rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
+		goto bail;
+	}
+
+	if (le32_to_cpu(di->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
+		     (unsigned long long)bh->b_blocknr,
+		     le32_to_cpu(di->i_fs_generation));
+		rc = -OCFS2_FILECHECK_ERR_GENERATION;
+		goto bail;
+	}
+
+bail:
+	return rc;
+}
+
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+					      struct buffer_head *bh)
+{
+	int changed = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	if (!ocfs2_filecheck_validate_inode_block(sb, bh))
+		return 0;
+
+	trace_ocfs2_filecheck_repair_inode_block(
+		(unsigned long long)bh->b_blocknr);
+
+	if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
+	    ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
+		mlog(ML_ERROR,
+		     "Filecheck: cannot repair dinode #%llu "
+		     "on readonly filesystem\n",
+		     (unsigned long long)bh->b_blocknr);
+		return -OCFS2_FILECHECK_ERR_READONLY;
+	}
+
+	if (buffer_jbd(bh)) {
+		mlog(ML_ERROR,
+		     "Filecheck: cannot repair dinode #%llu, "
+		     "its buffer is in jbd\n",
+		     (unsigned long long)bh->b_blocknr);
+		return -OCFS2_FILECHECK_ERR_INJBD;
+	}
+
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		/* Cannot fix invalid inode block */
+		return -OCFS2_FILECHECK_ERR_INVALIDINO;
+	}
+
+	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		/* Cannot just add VALID_FL flag back as a fix,
+		 * need more things to check here.
+		 */
+		return -OCFS2_FILECHECK_ERR_VALIDFLAG;
+	}
+
+	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+		di->i_blkno = cpu_to_le64(bh->b_blocknr);
+		changed = 1;
+		mlog(ML_ERROR,
+		     "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
+		     (unsigned long long)bh->b_blocknr,
+		     (unsigned long long)le64_to_cpu(di->i_blkno));
+	}
+
+	if (le32_to_cpu(di->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+		changed = 1;
+		mlog(ML_ERROR,
+		     "Filecheck: reset dinode #%llu: fs_generation to %u\n",
+		     (unsigned long long)bh->b_blocknr,
+		     le32_to_cpu(di->i_fs_generation));
+	}
+
+	if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
+		ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+		mark_buffer_dirty(bh);
+		mlog(ML_ERROR,
+		     "Filecheck: reset dinode #%llu: compute meta ecc\n",
+		     (unsigned long long)bh->b_blocknr);
+	}
+
+	return 0;
+}
+
+static int
+ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+				      struct buffer_head **bh,
+				      int flags, int type)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	if (!type) /* Check inode block */
+		rc = ocfs2_read_blocks(INODE_CACHE(inode),
+				OCFS2_I(inode)->ip_blkno,
+				1, &tmp, flags,
+				ocfs2_filecheck_validate_inode_block);
+	else /* Repair inode block */
+		rc = ocfs2_read_blocks(INODE_CACHE(inode),
+				OCFS2_I(inode)->ip_blkno,
+				1, &tmp, flags,
+				ocfs2_filecheck_repair_inode_block);
+
+	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
 int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
 				int flags)
 {
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..01635e016b3e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE		0x1
 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
+#define OCFS2_FI_FLAG_FILECHECK_CHK	0x4
+#define OCFS2_FI_FLAG_FILECHECK_FIX	0x8
+
 struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a52a2dbc064e..24b7e7f591dc 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1540,6 +1540,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
 DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
 
 DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
 
 TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
 	TP_PROTO(void *task, void *dc_task, unsigned long long ino,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
 	.attrs = ocfs2_attrs,
 };
 
-static struct kset *ocfs2_kset;
+struct kset *ocfs2_kset;
+EXPORT_SYMBOL_GPL(ocfs2_kset);
 
 static void ocfs2_sysfs_exit(void)
 {
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
 int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
 void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
 
+extern struct kset *ocfs2_kset;
+
 #endif  /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 302854ee0985..ccc9386c42c5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,6 +74,7 @@
 #include "suballoc.h"
 
 #include "buffer_head_io.h"
+#include "filecheck.h"
 
 static struct kmem_cache *ocfs2_inode_cachep;
 struct kmem_cache *ocfs2_dquot_cachep;
@@ -1205,6 +1206,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	/* Start this when the mount is almost sure of being successful */
 	ocfs2_orphan_scan_start(osb);
 
+	/* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
+	ocfs2_filecheck_create_sysfs(sb);
+
 	return status;
 
 read_super_error:
@@ -1668,6 +1672,7 @@ static void ocfs2_put_super(struct super_block *sb)
 
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
+	ocfs2_filecheck_remove_sysfs(sb);
 }
 
 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/open.c b/fs/open.c
index 55bdc75e2172..17cb6b1dab75 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -992,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
 EXPORT_SYMBOL(filp_open);
 
 struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
-			    const char *filename, int flags)
+			    const char *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int err = build_open_flags(flags, 0, &op);
+	int err = build_open_flags(flags, mode, &op);
 	if (err)
 		return ERR_PTR(err);
-	if (flags & O_CREAT)
-		return ERR_PTR(-EINVAL);
 	return do_file_open_root(dentry, mnt, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 2c6f0cb816b4..c54a24360f85 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -4,3 +4,4 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
 ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
 ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
 ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
+ubifs-y += misc.o
diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c
new file mode 100644
index 000000000000..486a2844949f
--- /dev/null
+++ b/fs/ubifs/misc.c
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include "ubifs.h"
+
+/* Normal UBIFS messages */
+void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_notice("UBIFS (ubi%d:%d): %pV\n",
+		  c->vi.ubi_num, c->vi.vol_id, &vaf);
+
+	va_end(args);
+}								    \
+
+/* UBIFS error messages */
+void ubifs_err(const struct ubifs_info *c, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_err("UBIFS error (ubi%d:%d pid %d): %ps: %pV\n",
+	       c->vi.ubi_num, c->vi.vol_id, current->pid,
+	       __builtin_return_address(0),
+	       &vaf);
+
+	va_end(args);
+}								    \
+
+/* UBIFS warning messages */
+void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_warn("UBIFS warning (ubi%d:%d pid %d): %ps: %pV\n",
+		c->vi.ubi_num, c->vi.vol_id, current->pid,
+		__builtin_return_address(0),
+		&vaf);
+
+	va_end(args);
+}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a5697de763f5..c2a57e193a81 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -42,30 +42,6 @@
 /* Version of this UBIFS implementation */
 #define UBIFS_VERSION 1
 
-/* Normal UBIFS messages */
-#define ubifs_msg(c, fmt, ...)                                      \
-	pr_notice("UBIFS (ubi%d:%d): " fmt "\n",                    \
-		  (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__)
-/* UBIFS error messages */
-#define ubifs_err(c, fmt, ...)                                      \
-	pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n",      \
-	       (c)->vi.ubi_num, (c)->vi.vol_id, current->pid,       \
-	       __func__, ##__VA_ARGS__)
-/* UBIFS warning messages */
-#define ubifs_warn(c, fmt, ...)                                     \
-	pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n",   \
-		(c)->vi.ubi_num, (c)->vi.vol_id, current->pid,      \
-		__func__, ##__VA_ARGS__)
-/*
- * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
- * object as an argument.
- */
-#define ubifs_errc(c, fmt, ...)                                     \
-	do {                                                        \
-		if (!(c)->probing)                                  \
-			ubifs_err(c, fmt, ##__VA_ARGS__);           \
-	} while (0)
-
 /* UBIFS file system VFS magic number */
 #define UBIFS_SUPER_MAGIC 0x24051905
 
@@ -1802,4 +1778,21 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
 #include "misc.h"
 #include "key.h"
 
+/* Normal UBIFS messages */
+__printf(2, 3)
+void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
+__printf(2, 3)
+void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
+__printf(2, 3)
+void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
+/*
+ * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
+ * object as an argument.
+ */
+#define ubifs_errc(c, fmt, ...)						\
+do {									\
+	if (!(c)->probing)						\
+		ubifs_err(c, fmt, ##__VA_ARGS__);			\
+} while (0)
+
 #endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index c7f4d434d098..b043e044121d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -59,7 +59,6 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
-#include <linux/posix_acl_xattr.h>
 
 /*
  * Limit the number of extended attributes per inode so that the total size
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f64639176670..3542d94fddce 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,4 +121,5 @@ xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
-xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
+xfs-$(CONFIG_NFSD_BLOCKLAYOUT)	+= xfs_pnfs.o
+xfs-$(CONFIG_NFSD_SCSILAYOUT)	+= xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 2816d42507bc..a1b2dd828b9d 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
 	.fh_to_parent		= xfs_fs_fh_to_parent,
 	.get_parent		= xfs_fs_get_parent,
 	.commit_metadata	= xfs_fs_nfs_commit_metadata,
-#ifdef CONFIG_NFSD_PNFS
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
 	.get_uuid		= xfs_fs_get_uuid,
 	.map_blocks		= xfs_fs_map_blocks,
 	.commit_blocks		= xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 8147ac108820..93f74853961b 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
 #ifndef _XFS_PNFS_H
 #define _XFS_PNFS_H 1
 
-#ifdef CONFIG_NFSD_PNFS
+#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
 int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
 int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 		struct iomap *iomap, bool write, u32 *device_generation);