Diffstat (limited to 'fs')
365 files changed, 13266 insertions, 9359 deletions
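Many of the hunks below are mechanical conversions from the old MS_* mount-flag constants to the new SB_* superblock-flag constants on sb->s_flags. A minimal, self-contained sketch of the remount pattern those hunks preserve; the flag values and the sb_rdonly() helper here are simplified stand-ins for the kernel definitions, not the real ones:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's SB_* superblock flags; the
 * values are not the kernel's. */
#define SB_RDONLY     0x0001
#define SB_NODIRATIME 0x0800

struct super_block { unsigned long s_flags; };

static bool sb_rdonly(const struct super_block *sb)
{
	return sb->s_flags & SB_RDONLY;
}

/* Mirrors the shape of the affs_remount() hunk in the diff: always
 * force SB_NODIRATIME, and skip any rework when the read-only state
 * is not actually changing. */
static int demo_remount(struct super_block *sb, int *flags)
{
	*flags |= SB_NODIRATIME;

	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
		return 0;	/* ro/rw state unchanged */

	printf("remounting %s\n",
	       (*flags & SB_RDONLY) ? "read-only" : "read-write");
	return 0;
}

int main(void)
{
	struct super_block sb = { .s_flags = 0 };
	int flags = SB_RDONLY;

	return demo_remount(&sb, &flags);
}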
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2a5de610dd8f..bdabb2765d1b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -483,6 +483,9 @@ static int v9fs_test_inode(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 70f9887c59a9..7f6ae21a27b3 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -87,6 +87,9 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8b75463cb211..af03c2a901eb 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -94,13 +94,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; - sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; if (!v9ses->cache) - sb->s_flags |= MS_SYNCHRONOUS; + sb->s_flags |= SB_SYNCHRONOUS; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif return 0; diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index b2f82cf6bf86..58c2bbd385ad 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -34,8 +34,8 @@ config ARCH_BINFMT_ELF_STATE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" - default y - depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + default y if !BINFMT_ELF + depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/adfs/super.c b/fs/adfs/super.c index c9fdfb112933..cfda2c7caedc 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -213,7 +213,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return parse_options(sb, data); } @@ -372,7 +372,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; int ret = -EINVAL; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 185d5ab7e986..0f0e6925e97d 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -453,7 +453,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...) 
pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); if (!sb_rdonly(sb)) pr_warn("Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; va_end(args); } diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 2b1399611d9e..5ba9ef2742f6 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -250,12 +250,12 @@ int affs_init_bitmap(struct super_block *sb, int *flags) int i, res = 0; struct affs_sb_info *sbi = AFFS_SB(sb); - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) return 0; if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -288,7 +288,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) if (affs_checksum_block(sb, bh)) { pr_warn("Bitmap %u invalid - mounting %s read only.\n", bm->bm_key, sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; goto out; } pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); diff --git a/fs/affs/super.c b/fs/affs/super.c index 884bedab7266..1117e36134cc 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -356,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); if (!sbi) @@ -466,7 +466,7 @@ got_root: if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS || chksum == MUFS_DCOFS) && !sb_rdonly(sb)) { pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } switch (chksum) { case MUFS_FS: @@ -488,7 +488,7 @@ got_root: /* fall thru */ case FS_OFS: affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + sb->s_flags |= SB_NOEXEC; break; case MUFS_DCOFS: case MUFS_INTLOFS: @@ -497,7 +497,7 @@ got_root: case FS_INTLOFS: affs_set_opt(sbi->s_flags, SF_INTL); affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + sb->s_flags |= SB_NOEXEC; break; default: pr_err("Unknown filesystem on device %s: %08X\n", @@ -513,7 +513,7 @@ got_root: sig, sig[3] + '0', blocksize); } - sb->s_flags |= MS_NODEV | MS_NOSUID; + sb->s_flags |= SB_NODEV | SB_NOSUID; sbi->s_data_blksize = sb->s_blocksize; if (affs_test_opt(sbi->s_flags, SF_OFS)) @@ -570,7 +570,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, @@ -596,10 +596,10 @@ affs_remount(struct super_block *sb, int *flags, char *data) memcpy(sbi->s_volume, volume, 32); spin_unlock(&sbi->symlink_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) affs_free_bitmap(sb); else res = affs_init_bitmap(sb, flags); diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 641148208e90..45b7fc405fa6 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -7,6 +7,7 @@ afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o kafs-objs := \ $(afs-cache-y) \ + addr_list.o \ callback.o \ cell.o \ cmservice.o \ @@ -19,14 +20,14 @@ kafs-objs := \ misc.o \ mntpt.o \ proc.o \ + rotate.o \ rxrpc.o \ security.o \ server.o \ + server_list.o \ super.o \ netdevices.o \ vlclient.o \ - vlocation.o \ - vnode.o 
\ volume.o \ write.o \ xattr.o diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c new file mode 100644 index 000000000000..a537368ba0db --- /dev/null +++ b/fs/afs/addr_list.c @@ -0,0 +1,381 @@ +/* Server address list management + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/dns_resolver.h> +#include <linux/inet.h> +#include <keys/rxrpc-type.h> +#include "internal.h" +#include "afs_fs.h" + +//#define AFS_MAX_ADDRESSES +// ((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) / +// sizeof(struct sockaddr_rxrpc))) +#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) + +/* + * Release an address list. + */ +void afs_put_addrlist(struct afs_addr_list *alist) +{ + if (alist && refcount_dec_and_test(&alist->usage)) + call_rcu(&alist->rcu, (rcu_callback_t)kfree); +} + +/* + * Allocate an address list. + */ +struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, + unsigned short service, + unsigned short port) +{ + struct afs_addr_list *alist; + unsigned int i; + + _enter("%u,%u,%u", nr, service, port); + + alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr, + GFP_KERNEL); + if (!alist) + return NULL; + + refcount_set(&alist->usage, 1); + + for (i = 0; i < nr; i++) { + struct sockaddr_rxrpc *srx = &alist->addrs[i]; + srx->srx_family = AF_RXRPC; + srx->srx_service = service; + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin6); + srx->transport.sin6.sin6_family = AF_INET6; + srx->transport.sin6.sin6_port = htons(port); + } + + return alist; +} + +/* + * Parse a text string consisting of delimited addresses. 
+ */ +struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, + char delim, + unsigned short service, + unsigned short port) +{ + struct afs_addr_list *alist; + const char *p, *end = text + len; + unsigned int nr = 0; + + _enter("%*.*s,%c", (int)len, (int)len, text, delim); + + if (!len) + return ERR_PTR(-EDESTADDRREQ); + + if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len))) + delim = ','; + + /* Count the addresses */ + p = text; + do { + if (!*p) + return ERR_PTR(-EINVAL); + if (*p == delim) + continue; + nr++; + if (*p == '[') { + p++; + if (p == end) + return ERR_PTR(-EINVAL); + p = memchr(p, ']', end - p); + if (!p) + return ERR_PTR(-EINVAL); + p++; + if (p >= end) + break; + } + + p = memchr(p, delim, end - p); + if (!p) + break; + p++; + } while (p < end); + + _debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES); + if (nr > AFS_MAX_ADDRESSES) + nr = AFS_MAX_ADDRESSES; + + alist = afs_alloc_addrlist(nr, service, port); + if (!alist) + return ERR_PTR(-ENOMEM); + + /* Extract the addresses */ + p = text; + do { + struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs]; + char tdelim = delim; + + if (*p == delim) { + p++; + continue; + } + + if (*p == '[') { + p++; + tdelim = ']'; + } + + if (in4_pton(p, end - p, + (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3], + tdelim, &p)) { + srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; + srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; + srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); + } else if (in6_pton(p, end - p, + srx->transport.sin6.sin6_addr.s6_addr, + tdelim, &p)) { + /* Nothing to do */ + } else { + goto bad_address; + } + + if (tdelim == ']') { + if (p == end || *p != ']') + goto bad_address; + p++; + } + + if (p < end) { + if (*p == '+') { + /* Port number specification "+1234" */ + unsigned int xport = 0; + p++; + if (p >= end || !isdigit(*p)) + goto bad_address; + do { + xport *= 10; + xport += *p - '0'; + if (xport > 65535) + goto bad_address; + p++; + } while (p < end && isdigit(*p)); + srx->transport.sin6.sin6_port = htons(xport); + } else if (*p == delim) { + p++; + } else { + goto bad_address; + } + } + + alist->nr_addrs++; + } while (p < end && alist->nr_addrs < AFS_MAX_ADDRESSES); + + _leave(" = [nr %u]", alist->nr_addrs); + return alist; + +bad_address: + kfree(alist); + return ERR_PTR(-EINVAL); +} + +/* + * Compare old and new address lists to see if there's been any change. + * - How to do this in better than O(Nlog(N)) time? + * - We don't really want to sort the address list, but would rather take the + * list as we got it so as not to undo record rotation by the DNS server. + */ +#if 0 +static int afs_cmp_addr_list(const struct afs_addr_list *a1, + const struct afs_addr_list *a2) +{ +} +#endif + +/* + * Perform a DNS query for VL servers and build a up an address list. + */ +struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) +{ + struct afs_addr_list *alist; + char *vllist = NULL; + int ret; + + _enter("%s", cell->name); + + ret = dns_query("afsdb", cell->name, cell->name_len, + "ipv4", &vllist, _expiry); + if (ret < 0) + return ERR_PTR(ret); + + alist = afs_parse_text_addrs(vllist, strlen(vllist), ',', + VL_SERVICE, AFS_VL_PORT); + if (IS_ERR(alist)) { + kfree(vllist); + if (alist != ERR_PTR(-ENOMEM)) + pr_err("Failed to parse DNS data\n"); + return alist; + } + + kfree(vllist); + return alist; +} + +/* + * Merge an IPv4 entry into a fileserver address list. 
+ */ +void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) +{ + struct sockaddr_in6 *a; + __be16 xport = htons(port); + int i; + + for (i = 0; i < alist->nr_ipv4; i++) { + a = &alist->addrs[i].transport.sin6; + if (xdr == a->sin6_addr.s6_addr32[3] && + xport == a->sin6_port) + return; + if (xdr == a->sin6_addr.s6_addr32[3] && + xport < a->sin6_port) + break; + if (xdr < a->sin6_addr.s6_addr32[3]) + break; + } + + if (i < alist->nr_addrs) + memmove(alist->addrs + i + 1, + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + + a = &alist->addrs[i].transport.sin6; + a->sin6_port = xport; + a->sin6_addr.s6_addr32[0] = 0; + a->sin6_addr.s6_addr32[1] = 0; + a->sin6_addr.s6_addr32[2] = htonl(0xffff); + a->sin6_addr.s6_addr32[3] = xdr; + alist->nr_ipv4++; + alist->nr_addrs++; +} + +/* + * Merge an IPv6 entry into a fileserver address list. + */ +void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) +{ + struct sockaddr_in6 *a; + __be16 xport = htons(port); + int i, diff; + + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { + a = &alist->addrs[i].transport.sin6; + diff = memcmp(xdr, &a->sin6_addr, 16); + if (diff == 0 && + xport == a->sin6_port) + return; + if (diff == 0 && + xport < a->sin6_port) + break; + if (diff < 0) + break; + } + + if (i < alist->nr_addrs) + memmove(alist->addrs + i + 1, + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + + a = &alist->addrs[i].transport.sin6; + a->sin6_port = xport; + a->sin6_addr.s6_addr32[0] = xdr[0]; + a->sin6_addr.s6_addr32[1] = xdr[1]; + a->sin6_addr.s6_addr32[2] = xdr[2]; + a->sin6_addr.s6_addr32[3] = xdr[3]; + alist->nr_addrs++; +} + +/* + * Get an address to try. + */ +bool afs_iterate_addresses(struct afs_addr_cursor *ac) +{ + _enter("%hu+%hd", ac->start, (short)ac->index); + + if (!ac->alist) + return false; + + if (ac->begun) { + ac->index++; + if (ac->index == ac->alist->nr_addrs) + ac->index = 0; + + if (ac->index == ac->start) { + ac->error = -EDESTADDRREQ; + return false; + } + } + + ac->begun = true; + ac->responded = false; + ac->addr = &ac->alist->addrs[ac->index]; + return true; +} + +/* + * Release an address list cursor. + */ +int afs_end_cursor(struct afs_addr_cursor *ac) +{ + if (ac->responded && ac->index != ac->start) + WRITE_ONCE(ac->alist->index, ac->index); + + afs_put_addrlist(ac->alist); + ac->alist = NULL; + return ac->error; +} + +/* + * Set the address cursor for iterating over VL servers. 
+ */ +int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell) +{ + struct afs_addr_list *alist; + int ret; + + if (!rcu_access_pointer(cell->vl_addrs)) { + ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET, + TASK_INTERRUPTIBLE); + if (ret < 0) + return ret; + + if (!rcu_access_pointer(cell->vl_addrs) && + ktime_get_real_seconds() < cell->dns_expiry) + return cell->error; + } + + read_lock(&cell->vl_addrs_lock); + alist = rcu_dereference_protected(cell->vl_addrs, + lockdep_is_held(&cell->vl_addrs_lock)); + if (alist->nr_addrs > 0) + afs_get_addrlist(alist); + else + alist = NULL; + read_unlock(&cell->vl_addrs_lock); + + if (!alist) + return -EDESTADDRREQ; + + ac->alist = alist; + ac->addr = NULL; + ac->start = READ_ONCE(alist->index); + ac->index = ac->start; + ac->error = 0; + ac->begun = false; + return 0; +} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 3c462ff6db63..b94d0edc2b78 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -14,11 +14,14 @@ #include <linux/in.h> -#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */ -#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */ -#define AFSNAMEMAX 256 /* maximum length of a filename plus NUL */ -#define AFSPATHMAX 1024 /* maximum length of a pathname plus NUL */ -#define AFSOPAQUEMAX 1024 /* maximum length of an opaque field */ +#define AFS_MAXCELLNAME 64 /* Maximum length of a cell name */ +#define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ +#define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ +#define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ +#define AFS_MAXTYPES 3 /* Maximum number of volume types */ +#define AFSNAMEMAX 256 /* Maximum length of a filename plus NUL */ +#define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */ +#define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */ typedef unsigned afs_volid_t; typedef unsigned afs_vnodeid_t; @@ -72,6 +75,15 @@ struct afs_callback { #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ +struct afs_uuid { + __be32 time_low; /* low part of timestamp */ + __be16 time_mid; /* mid part of timestamp */ + __be16 time_hi_and_version; /* high part of timestamp and version */ + __s8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ + __s8 clock_seq_low; /* clock seq low */ + __s8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + /* * AFS volume information */ @@ -124,7 +136,6 @@ struct afs_file_status { afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ - struct afs_fid parent; /* parent dir ID for non-dirs only */ time_t mtime_client; /* last time client changed data */ time_t mtime_server; /* last time server changed data */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ @@ -167,4 +178,16 @@ struct afs_volume_status { #define AFS_BLOCK_SIZE 1024 +/* + * XDR encoding of UUID in AFS. 
+ */ +struct afs_uuid__xdr { + __be32 time_low; + __be32 time_mid; + __be32 time_hi_and_version; + __be32 clock_seq_hi_and_reserved; + __be32 clock_seq_low; + __be32 node[6]; +}; + #endif /* AFS_H */ diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h index eb647323d8f0..d47b6d01e4c0 100644 --- a/fs/afs/afs_fs.h +++ b/fs/afs/afs_fs.h @@ -37,9 +37,12 @@ enum AFS_FS_Operations { FSLOOKUP = 161, /* AFS lookup file in directory */ FSFETCHDATA64 = 65537, /* AFS Fetch file data */ FSSTOREDATA64 = 65538, /* AFS Store file data */ + FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */ + FSGETCAPABILITIES = 65540, /* Probe and get the capabilities of a fileserver */ }; enum AFS_FS_Errors { + VRESTARTING = -100, /* Server is restarting */ VSALVAGE = 101, /* volume needs salvaging */ VNOVNODE = 102, /* no such file/dir (vnode) */ VNOVOL = 103, /* no such volume or volume unavailable */ @@ -51,6 +54,9 @@ enum AFS_FS_Errors { VOVERQUOTA = 109, /* volume's maximum quota exceeded */ VBUSY = 110, /* volume is temporarily unavailable */ VMOVED = 111, /* volume moved to new server - ask this FS where */ + VIO = 112, /* I/O error in volume */ + VSALVAGING = 113, /* Volume is being salvaged */ + VRESTRICTED = 120, /* Volume is restricted from using */ }; #endif /* AFS_FS_H */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index 800f607ffaf5..e3c4688f573b 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -16,11 +16,17 @@ #define AFS_VL_PORT 7003 /* volume location service port */ #define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ +#define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */ enum AFSVL_Operations { - VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */ - VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */ - VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */ + VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */ + VLGETENTRYBYNAME = 504, /* AFS Get VLDB entry by name */ + VLPROBE = 514, /* AFS probe VL service */ + VLGETENTRYBYIDU = 526, /* AFS Get VLDB entry by ID (UUID-variant) */ + VLGETENTRYBYNAMEU = 527, /* AFS Get VLDB entry by name (UUID-variant) */ + VLGETADDRSU = 533, /* AFS Get addrs for fileserver */ + YVLGETENDPOINTS = 64002, /* YFS Get endpoints for file/volume server */ + VLGETCAPABILITIES = 65537, /* AFS Get server capabilities */ }; enum AFSVL_Errors { @@ -54,6 +60,19 @@ enum AFSVL_Errors { AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */ }; +enum { + YFS_SERVER_INDEX = 0, + YFS_SERVER_UUID = 1, + YFS_SERVER_ENDPOINT = 2, +}; + +enum { + YFS_ENDPOINT_IPV4 = 0, + YFS_ENDPOINT_IPV6 = 1, +}; + +#define YFS_MAXENDPOINTS 16 + /* * maps to "struct vldbentry" in vvl-spec.pdf */ @@ -74,11 +93,57 @@ struct afs_vldbentry { struct in_addr addr; /* server address */ unsigned partition; /* partition ID on this server */ unsigned flags; /* server specific flags */ -#define AFS_VLSF_NEWREPSITE 0x0001 /* unused */ +#define AFS_VLSF_NEWREPSITE 0x0001 /* Ignore all 'non-new' servers */ #define AFS_VLSF_ROVOL 0x0002 /* this server holds a R/O instance of the volume */ #define AFS_VLSF_RWVOL 0x0004 /* this server holds a R/W instance of the volume */ #define AFS_VLSF_BACKVOL 0x0008 /* this server holds a backup instance of the volume */ +#define AFS_VLSF_UUID 0x0010 /* This server is referred to by its UUID */ +#define AFS_VLSF_DONTUSE 0x0020 /* This server ref should be ignored */ } servers[8]; }; +#define AFS_VLDB_MAXNAMELEN 65 + + +struct 
afs_ListAddrByAttributes__xdr { + __be32 Mask; +#define AFS_VLADDR_IPADDR 0x1 /* Match by ->ipaddr */ +#define AFS_VLADDR_INDEX 0x2 /* Match by ->index */ +#define AFS_VLADDR_UUID 0x4 /* Match by ->uuid */ + __be32 ipaddr; + __be32 index; + __be32 spare; + struct afs_uuid__xdr uuid; +}; + +struct afs_uvldbentry__xdr { + __be32 name[AFS_VLDB_MAXNAMELEN]; + __be32 nServers; + struct afs_uuid__xdr serverNumber[AFS_NMAXNSERVERS]; + __be32 serverUnique[AFS_NMAXNSERVERS]; + __be32 serverPartition[AFS_NMAXNSERVERS]; + __be32 serverFlags[AFS_NMAXNSERVERS]; + __be32 volumeId[AFS_MAXTYPES]; + __be32 cloneId; + __be32 flags; + __be32 spares1; + __be32 spares2; + __be32 spares3; + __be32 spares4; + __be32 spares5; + __be32 spares6; + __be32 spares7; + __be32 spares8; + __be32 spares9; +}; + +struct afs_address_list { + refcount_t usage; + unsigned int version; + unsigned int nr_addrs; + struct sockaddr_rxrpc addrs[]; +}; + +extern void afs_put_address_list(struct afs_address_list *alist); + #endif /* AFS_VL_H */ diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 1fe855191261..f62ff71d28c9 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -14,19 +14,6 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, void *buffer, uint16_t buflen); -static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen); - -static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static enum fscache_checkaux afs_vlocation_cache_check_aux( - void *cookie_netfs_data, const void *buffer, uint16_t buflen); - static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, void *buffer, uint16_t buflen); @@ -42,23 +29,13 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, struct fscache_netfs afs_cache_netfs = { .name = "afs", - .version = 0, + .version = 1, }; struct fscache_cookie_def afs_cell_cache_index_def = { .name = "AFS.cell", .type = FSCACHE_COOKIE_TYPE_INDEX, .get_key = afs_cell_cache_get_key, - .get_aux = afs_cell_cache_get_aux, - .check_aux = afs_cell_cache_check_aux, -}; - -struct fscache_cookie_def afs_vlocation_cache_index_def = { - .name = "AFS.vldb", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_vlocation_cache_get_key, - .get_aux = afs_vlocation_cache_get_aux, - .check_aux = afs_vlocation_cache_check_aux, }; struct fscache_cookie_def afs_volume_cache_index_def = { @@ -95,150 +72,26 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, return klen; } -/* - * provide new auxiliary cache data - */ -static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_cell *cell = cookie_netfs_data; - uint16_t dlen; - - _enter("%p,%p,%u", cell, buffer, bufmax); - - dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); - dlen = min(dlen, bufmax); - dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); - - memcpy(buffer, cell->vl_addrs, dlen); - return dlen; -} - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen) -{ - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} - 
-/*****************************************************************************/ -/* - * set the key for the index entry - */ -static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t klen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); - - klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); - if (klen > bufmax) - return 0; - - memcpy(buffer, vlocation->vldb.name, klen); - - _leave(" = %u", klen); - return klen; -} - -/* - * provide new auxiliary cache data - */ -static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t dlen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); - - dlen = sizeof(struct afs_cache_vlocation); - dlen -= offsetof(struct afs_cache_vlocation, nservers); - if (dlen > bufmax) - return 0; - - memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); - - _leave(" = %u", dlen); - return dlen; -} - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static -enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen) -{ - const struct afs_cache_vlocation *cvldb; - struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t dlen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); - - /* check the size of the data is what we're expecting */ - dlen = sizeof(struct afs_cache_vlocation); - dlen -= offsetof(struct afs_cache_vlocation, nservers); - if (dlen != buflen) - return FSCACHE_CHECKAUX_OBSOLETE; - - cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); - - /* if what's on disk is more valid than what's in memory, then use the - * VL record from the cache */ - if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { - memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); - vlocation->valid = 1; - _leave(" = SUCCESS [c->m]"); - return FSCACHE_CHECKAUX_OKAY; - } - - /* need to update the cache if the cached info differs */ - if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { - /* delete if the volume IDs for this name differ */ - if (memcmp(&vlocation->vldb.vid, &cvldb->vid, - sizeof(cvldb->vid)) != 0 - ) { - _leave(" = OBSOLETE"); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = UPDATE"); - return FSCACHE_CHECKAUX_NEEDS_UPDATE; - } - - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} - /*****************************************************************************/ /* * set the key for the volume index entry */ static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) + void *buffer, uint16_t bufmax) { const struct afs_volume *volume = cookie_netfs_data; - uint16_t klen; + struct { + u64 volid; + } __packed key; _enter("{%u},%p,%u", volume->type, buffer, bufmax); - klen = sizeof(volume->type); - if (klen > bufmax) + if (bufmax < sizeof(key)) return 0; - memcpy(buffer, &volume->type, sizeof(volume->type)); - - _leave(" = %u", klen); - return klen; - + key.volid = volume->vid; + memcpy(buffer, &key, sizeof(key)); + return sizeof(key); } /*****************************************************************************/ @@ -249,20 +102,25 @@ static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { const struct afs_vnode *vnode = cookie_netfs_data; - 
uint16_t klen; + struct { + u32 vnode_id[3]; + } __packed key; _enter("{%x,%x,%llx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, bufmax); - klen = sizeof(vnode->fid.vnode); - if (klen > bufmax) - return 0; + /* Allow for a 96-bit key */ + memset(&key, 0, sizeof(key)); + key.vnode_id[0] = vnode->fid.vnode; + key.vnode_id[1] = 0; + key.vnode_id[2] = 0; - memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); + if (sizeof(key) > bufmax) + return 0; - _leave(" = %u", klen); - return klen; + memcpy(buffer, &key, sizeof(key)); + return sizeof(key); } /* @@ -280,6 +138,11 @@ static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, *size = vnode->status.size; } +struct afs_vnode_cache_aux { + u64 data_version; + u32 fid_unique; +} __packed; + /* * provide new auxiliary cache data */ @@ -287,23 +150,21 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { const struct afs_vnode *vnode = cookie_netfs_data; - uint16_t dlen; + struct afs_vnode_cache_aux aux; _enter("{%x,%x,%Lx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, bufmax); - dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); - if (dlen > bufmax) - return 0; + memset(&aux, 0, sizeof(aux)); + aux.data_version = vnode->status.data_version; + aux.fid_unique = vnode->fid.unique; - memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); - buffer += sizeof(vnode->fid.unique); - memcpy(buffer, &vnode->status.data_version, - sizeof(vnode->status.data_version)); + if (bufmax < sizeof(aux)) + return 0; - _leave(" = %u", dlen); - return dlen; + memcpy(buffer, &aux, sizeof(aux)); + return sizeof(aux); } /* @@ -314,43 +175,29 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, uint16_t buflen) { struct afs_vnode *vnode = cookie_netfs_data; - uint16_t dlen; + struct afs_vnode_cache_aux aux; _enter("{%x,%x,%llx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, buflen); + memcpy(&aux, buffer, sizeof(aux)); + /* check the size of the data is what we're expecting */ - dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); - if (dlen != buflen) { - _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); + if (buflen != sizeof(aux)) { + _leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux)); return FSCACHE_CHECKAUX_OBSOLETE; } - if (memcmp(buffer, - &vnode->fid.unique, - sizeof(vnode->fid.unique) - ) != 0) { - unsigned unique; - - memcpy(&unique, buffer, sizeof(unique)); - + if (vnode->fid.unique != aux.fid_unique) { _leave(" = OBSOLETE [uniq %x != %x]", - unique, vnode->fid.unique); + aux.fid_unique, vnode->fid.unique); return FSCACHE_CHECKAUX_OBSOLETE; } - if (memcmp(buffer + sizeof(vnode->fid.unique), - &vnode->status.data_version, - sizeof(vnode->status.data_version) - ) != 0) { - afs_dataversion_t version; - - memcpy(&version, buffer + sizeof(vnode->fid.unique), - sizeof(version)); - + if (vnode->status.data_version != aux.data_version) { _leave(" = OBSOLETE [vers %llx != %llx]", - version, vnode->status.data_version); + aux.data_version, vnode->status.data_version); return FSCACHE_CHECKAUX_OBSOLETE; } diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 25d404d22cae..f4291b576054 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -20,118 +20,151 @@ #include <linux/sched.h> #include "internal.h" -#if 0 -unsigned afs_vnode_update_timeout = 10; -#endif /* 0 */ - -#define afs_breakring_space(server) \ - 
CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \ - ARRAY_SIZE((server)->cb_break)) - -//static void afs_callback_updater(struct work_struct *); - -static struct workqueue_struct *afs_callback_update_worker; - /* - * allow the fileserver to request callback state (re-)initialisation + * Set up an interest-in-callbacks record for a volume on a server and + * register it with the server. + * - Called with volume->server_sem held. */ -void afs_init_callback_state(struct afs_server *server) +int afs_register_server_cb_interest(struct afs_vnode *vnode, + struct afs_server_entry *entry) { - struct afs_vnode *vnode; - - _enter("{%p}", server); + struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x; + struct afs_server *server = entry->server; + +again: + vcbi = vnode->cb_interest; + if (vcbi) { + if (vcbi == cbi) + return 0; + + if (cbi && vcbi->server == cbi->server) { + write_seqlock(&vnode->cb_lock); + vnode->cb_interest = afs_get_cb_interest(cbi); + write_sequnlock(&vnode->cb_lock); + afs_put_cb_interest(afs_v2net(vnode), cbi); + return 0; + } - spin_lock(&server->cb_lock); + if (!cbi && vcbi->server == server) { + afs_get_cb_interest(vcbi); + x = cmpxchg(&entry->cb_interest, cbi, vcbi); + if (x != cbi) { + cbi = x; + afs_put_cb_interest(afs_v2net(vnode), vcbi); + goto again; + } + return 0; + } + } - /* kill all the promises on record from this server */ - while (!RB_EMPTY_ROOT(&server->cb_promises)) { - vnode = rb_entry(server->cb_promises.rb_node, - struct afs_vnode, cb_promise); - _debug("UNPROMISE { vid=%x:%u uq=%u}", - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; + if (!cbi) { + new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL); + if (!new) + return -ENOMEM; + + refcount_set(&new->usage, 1); + new->sb = vnode->vfs_inode.i_sb; + new->vid = vnode->volume->vid; + new->server = afs_get_server(server); + INIT_LIST_HEAD(&new->cb_link); + + write_lock(&server->cb_break_lock); + list_add_tail(&new->cb_link, &server->cb_interests); + write_unlock(&server->cb_break_lock); + + x = cmpxchg(&entry->cb_interest, cbi, new); + if (x == cbi) { + cbi = new; + } else { + cbi = x; + afs_put_cb_interest(afs_v2net(vnode), new); + } } - spin_unlock(&server->cb_lock); - _leave(""); + ASSERT(cbi); + + /* Change the server the vnode is using. This entails scrubbing any + * interest the vnode had in the previous server it was using. + */ + write_seqlock(&vnode->cb_lock); + + vnode->cb_interest = afs_get_cb_interest(cbi); + vnode->cb_s_break = cbi->server->cb_s_break; + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + + write_sequnlock(&vnode->cb_lock); + return 0; } /* - * handle the data invalidation side of a callback being broken + * Set a vnode's interest on a server. 
*/ -void afs_broken_callback_work(struct work_struct *work) +void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi) { - struct afs_vnode *vnode = - container_of(work, struct afs_vnode, cb_broken_work); + struct afs_cb_interest *old_cbi = NULL; - _enter(""); - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + if (vnode->cb_interest == cbi) return; - /* we're only interested in dealing with a broken callback on *this* - * vnode and only if no-one else has dealt with it yet */ - if (!mutex_trylock(&vnode->validate_lock)) - return; /* someone else is dealing with it */ - - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { - if (S_ISDIR(vnode->vfs_inode.i_mode)) - afs_clear_permits(vnode); - - if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0) - goto out; - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto out; - - /* if the vnode's data version number changed then its contents - * are different */ - if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - afs_zap_data(vnode); + write_seqlock(&vnode->cb_lock); + if (vnode->cb_interest != cbi) { + afs_get_cb_interest(cbi); + old_cbi = vnode->cb_interest; + vnode->cb_interest = cbi; } + write_sequnlock(&vnode->cb_lock); + afs_put_cb_interest(afs_v2net(vnode), cbi); +} -out: - mutex_unlock(&vnode->validate_lock); - - /* avoid the potential race whereby the mutex_trylock() in this - * function happens again between the clear_bit() and the - * mutex_unlock() */ - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { - _debug("requeue"); - queue_work(afs_callback_update_worker, &vnode->cb_broken_work); +/* + * Remove an interest on a server. + */ +void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) +{ + if (cbi && refcount_dec_and_test(&cbi->usage)) { + if (!list_empty(&cbi->cb_link)) { + write_lock(&cbi->server->cb_break_lock); + list_del_init(&cbi->cb_link); + write_unlock(&cbi->server->cb_break_lock); + afs_put_server(net, cbi->server); + } + kfree(cbi); } - _leave(""); +} + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +void afs_init_callback_state(struct afs_server *server) +{ + if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags)) + server->cb_s_break++; } /* * actually break a callback */ -static void afs_break_callback(struct afs_server *server, - struct afs_vnode *vnode) +void afs_break_callback(struct afs_vnode *vnode) { _enter(""); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + write_seqlock(&vnode->cb_lock); + + if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + vnode->cb_break++; + afs_clear_permits(vnode); - if (vnode->cb_promised) { spin_lock(&vnode->lock); _debug("break callback"); - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&server->cb_lock); - - queue_work(afs_callback_update_worker, &vnode->cb_broken_work); if (list_empty(&vnode->granted_locks) && !list_empty(&vnode->pending_locks)) afs_lock_may_be_available(vnode); spin_unlock(&vnode->lock); } + + write_sequnlock(&vnode->cb_lock); } /* @@ -143,49 +176,31 @@ static void afs_break_callback(struct afs_server *server, static void afs_break_one_callback(struct afs_server *server, struct afs_fid *fid) { + struct afs_cb_interest *cbi; + struct afs_iget_data data; struct afs_vnode *vnode; - struct rb_node *p; - - _debug("find"); - spin_lock(&server->fs_lock); - p = server->fs_vnodes.rb_node; - while (p) { - vnode = rb_entry(p, struct afs_vnode, server_rb); - 
if (fid->vid < vnode->fid.vid) - p = p->rb_left; - else if (fid->vid > vnode->fid.vid) - p = p->rb_right; - else if (fid->vnode < vnode->fid.vnode) - p = p->rb_left; - else if (fid->vnode > vnode->fid.vnode) - p = p->rb_right; - else if (fid->unique < vnode->fid.unique) - p = p->rb_left; - else if (fid->unique > vnode->fid.unique) - p = p->rb_right; - else - goto found; - } - - /* not found so we just ignore it (it may have moved to another - * server) */ -not_available: - _debug("not avail"); - spin_unlock(&server->fs_lock); - _leave(""); - return; + struct inode *inode; -found: - _debug("found"); - ASSERTCMP(server, ==, vnode->server); + read_lock(&server->cb_break_lock); - if (!igrab(AFS_VNODE_TO_I(vnode))) - goto not_available; - spin_unlock(&server->fs_lock); + /* Step through all interested superblocks. There may be more than one + * because of cell aliasing. + */ + list_for_each_entry(cbi, &server->cb_interests, cb_link) { + if (cbi->vid != fid->vid) + continue; + + data.volume = NULL; + data.fid = *fid; + inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data); + if (inode) { + vnode = AFS_FS_I(inode); + afs_break_callback(vnode); + iput(inode); + } + } - afs_break_callback(server, vnode); - iput(&vnode->vfs_inode); - _leave(""); + read_unlock(&server->cb_break_lock); } /* @@ -216,261 +231,14 @@ void afs_break_callbacks(struct afs_server *server, size_t count, } /* - * record the callback for breaking - * - the caller must hold server->cb_lock + * Clear the callback interests in a server list. */ -static void afs_do_give_up_callback(struct afs_server *server, - struct afs_vnode *vnode) +void afs_clear_callback_interests(struct afs_net *net, struct afs_server_list *slist) { - struct afs_callback *cb; - - _enter("%p,%p", server, vnode); - - cb = &server->cb_break[server->cb_break_head]; - cb->fid = vnode->fid; - cb->version = vnode->cb_version; - cb->expiry = vnode->cb_expiry; - cb->type = vnode->cb_type; - smp_wmb(); - server->cb_break_head = - (server->cb_break_head + 1) & - (ARRAY_SIZE(server->cb_break) - 1); - - /* defer the breaking of callbacks to try and collect as many as - * possible to ship in one operation */ - switch (atomic_inc_return(&server->cb_break_n)) { - case 1 ... 
AFSCBMAX - 1: - queue_delayed_work(afs_callback_update_worker, - &server->cb_break_work, HZ * 2); - break; - case AFSCBMAX: - afs_flush_callback_breaks(server); - break; - default: - break; - } - - ASSERT(server->cb_promises.rb_node != NULL); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - _leave(""); -} - -/* - * discard the callback on a deleted item - */ -void afs_discard_callback_on_delete(struct afs_vnode *vnode) -{ - struct afs_server *server = vnode->server; + int i; - _enter("%d", vnode->cb_promised); - - if (!vnode->cb_promised) { - _leave(" [not promised]"); - return; - } - - ASSERT(server != NULL); - - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - ASSERT(server->cb_promises.rb_node != NULL); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; + for (i = 0; i < slist->nr_servers; i++) { + afs_put_cb_interest(net, slist->servers[i].cb_interest); + slist->servers[i].cb_interest = NULL; } - spin_unlock(&server->cb_lock); - _leave(""); -} - -/* - * give up the callback registered for a vnode on the file server when the - * inode is being cleared - */ -void afs_give_up_callback(struct afs_vnode *vnode) -{ - struct afs_server *server = vnode->server; - - DECLARE_WAITQUEUE(myself, current); - - _enter("%d", vnode->cb_promised); - - _debug("GIVE UP INODE %p", &vnode->vfs_inode); - - if (!vnode->cb_promised) { - _leave(" [not promised]"); - return; - } - - ASSERT(server != NULL); - - spin_lock(&server->cb_lock); - if (vnode->cb_promised && afs_breakring_space(server) == 0) { - add_wait_queue(&server->cb_break_waitq, &myself); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!vnode->cb_promised || - afs_breakring_space(server) != 0) - break; - spin_unlock(&server->cb_lock); - schedule(); - spin_lock(&server->cb_lock); - } - remove_wait_queue(&server->cb_break_waitq, &myself); - __set_current_state(TASK_RUNNING); - } - - /* of course, it's always possible for the server to break this vnode's - * callback first... 
*/ - if (vnode->cb_promised) - afs_do_give_up_callback(server, vnode); - - spin_unlock(&server->cb_lock); - _leave(""); -} - -/* - * dispatch a deferred give up callbacks operation - */ -void afs_dispatch_give_up_callbacks(struct work_struct *work) -{ - struct afs_server *server = - container_of(work, struct afs_server, cb_break_work.work); - - _enter(""); - - /* tell the fileserver to discard the callback promises it has - * - in the event of ENOMEM or some other error, we just forget that we - * had callbacks entirely, and the server will call us later to break - * them - */ - afs_fs_give_up_callbacks(server, true); -} - -/* - * flush the outstanding callback breaks on a server - */ -void afs_flush_callback_breaks(struct afs_server *server) -{ - mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0); -} - -#if 0 -/* - * update a bunch of callbacks - */ -static void afs_callback_updater(struct work_struct *work) -{ - struct afs_server *server; - struct afs_vnode *vnode, *xvnode; - time64_t now; - long timeout; - int ret; - - server = container_of(work, struct afs_server, updater); - - _enter(""); - - now = ktime_get_real_seconds(); - - /* find the first vnode to update */ - spin_lock(&server->cb_lock); - for (;;) { - if (RB_EMPTY_ROOT(&server->cb_promises)) { - spin_unlock(&server->cb_lock); - _leave(" [nothing]"); - return; - } - - vnode = rb_entry(rb_first(&server->cb_promises), - struct afs_vnode, cb_promise); - if (atomic_read(&vnode->usage) > 0) - break; - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - } - - timeout = vnode->update_at - now; - if (timeout > 0) { - queue_delayed_work(afs_vnode_update_worker, - &afs_vnode_update, timeout * HZ); - spin_unlock(&server->cb_lock); - _leave(" [nothing]"); - return; - } - - list_del_init(&vnode->update); - atomic_inc(&vnode->usage); - spin_unlock(&server->cb_lock); - - /* we can now perform the update */ - _debug("update %s", vnode->vldb.name); - vnode->state = AFS_VL_UPDATING; - vnode->upd_rej_cnt = 0; - vnode->upd_busy_cnt = 0; - - ret = afs_vnode_update_record(vl, &vldb); - switch (ret) { - case 0: - afs_vnode_apply_update(vl, &vldb); - vnode->state = AFS_VL_UPDATING; - break; - case -ENOMEDIUM: - vnode->state = AFS_VL_VOLUME_DELETED; - break; - default: - vnode->state = AFS_VL_UNCERTAIN; - break; - } - - /* and then reschedule */ - _debug("reschedule"); - vnode->update_at = ktime_get_real_seconds() + - afs_vnode_update_timeout; - - spin_lock(&server->cb_lock); - - if (!list_empty(&server->cb_promises)) { - /* next update in 10 minutes, but wait at least 1 second more - * than the newest record already queued so that we don't spam - * the VL server suddenly with lots of requests - */ - xvnode = list_entry(server->cb_promises.prev, - struct afs_vnode, update); - if (vnode->update_at <= xvnode->update_at) - vnode->update_at = xvnode->update_at + 1; - xvnode = list_entry(server->cb_promises.next, - struct afs_vnode, update); - timeout = xvnode->update_at - now; - if (timeout < 0) - timeout = 0; - } else { - timeout = afs_vnode_update_timeout; - } - - list_add_tail(&vnode->update, &server->cb_promises); - - _debug("timeout %ld", timeout); - queue_delayed_work(afs_vnode_update_worker, - &afs_vnode_update, timeout * HZ); - spin_unlock(&server->cb_lock); - afs_put_vnode(vl); -} -#endif - -/* - * initialise the callback update process - */ -int __init afs_callback_update_init(void) -{ - afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd", - WQ_MEM_RECLAIM); - return 
afs_callback_update_worker ? 0 : -ENOMEM; -} - -/* - * shut down the callback update process - */ -void afs_callback_update_kill(void) -{ - destroy_workqueue(afs_callback_update_worker); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index ca0a3cf93791..9bb921d120d0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -1,6 +1,6 @@ /* AFS cell and server record management * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -9,213 +9,291 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> #include <linux/dns_resolver.h> #include <linux/sched.h> +#include <linux/inet.h> #include <keys/rxrpc-type.h> #include "internal.h" -DECLARE_RWSEM(afs_proc_cells_sem); -LIST_HEAD(afs_proc_cells); +unsigned __read_mostly afs_cell_gc_delay = 10; -static LIST_HEAD(afs_cells); -static DEFINE_RWLOCK(afs_cells_lock); -static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ -static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); -static struct afs_cell *afs_cell_root; +static void afs_manage_cell(struct work_struct *); + +static void afs_dec_cells_outstanding(struct afs_net *net) +{ + if (atomic_dec_and_test(&net->cells_outstanding)) + wake_up_atomic_t(&net->cells_outstanding); +} /* - * allocate a cell record and fill in its name, VL server address list and - * allocate an anonymous key + * Set the cell timer to fire after a given delay, assuming it's not already + * set for an earlier time. */ -static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen, - char *vllist) +static void afs_set_cell_timer(struct afs_net *net, time64_t delay) { - struct afs_cell *cell; - struct key *key; - char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; - char *dvllist = NULL, *_vllist = NULL; - char delimiter = ':'; - int ret; + if (net->live) { + atomic_inc(&net->cells_outstanding); + if (timer_reduce(&net->cells_timer, jiffies + delay * HZ)) + afs_dec_cells_outstanding(net); + } +} - _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist); +/* + * Look up and get an activation reference on a cell record under RCU + * conditions. The caller must hold the RCU read lock. + */ +struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, + const char *name, unsigned int namesz) +{ + struct afs_cell *cell = NULL; + struct rb_node *p; + int n, seq = 0, ret = 0; - BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ + _enter("%*.*s", namesz, namesz, name); - if (namelen > AFS_MAXCELLNAME) { - _leave(" = -ENAMETOOLONG"); + if (name && namesz == 0) + return ERR_PTR(-EINVAL); + if (namesz > AFS_MAXCELLNAME) return ERR_PTR(-ENAMETOOLONG); - } - /* allocate and initialise a cell record */ - cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); - if (!cell) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } + do { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. 
+ */ + if (cell) + afs_put_cell(net, cell); + cell = NULL; + ret = -ENOENT; - memcpy(cell->name, name, namelen); - cell->name[namelen] = 0; - - atomic_set(&cell->usage, 1); - INIT_LIST_HEAD(&cell->link); - rwlock_init(&cell->servers_lock); - INIT_LIST_HEAD(&cell->servers); - init_rwsem(&cell->vl_sem); - INIT_LIST_HEAD(&cell->vl_list); - spin_lock_init(&cell->vl_lock); - - /* if the ip address is invalid, try dns query */ - if (!vllist || strlen(vllist) < 7) { - ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); - if (ret < 0) { - if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY) - /* translate these errors into something - * userspace might understand */ - ret = -EDESTADDRREQ; - _leave(" = %d", ret); - return ERR_PTR(ret); + read_seqbegin_or_lock(&net->cells_lock, &seq); + + if (!name) { + cell = rcu_dereference_raw(net->ws_cell); + if (cell) { + afs_get_cell(cell); + continue; + } + ret = -EDESTADDRREQ; + continue; } - _vllist = dvllist; - /* change the delimiter for user-space reply */ - delimiter = ','; + p = rcu_dereference_raw(net->cells.rb_node); + while (p) { + cell = rb_entry(p, struct afs_cell, net_node); + + n = strncasecmp(cell->name, name, + min_t(size_t, cell->name_len, namesz)); + if (n == 0) + n = cell->name_len - namesz; + if (n < 0) { + p = rcu_dereference_raw(p->rb_left); + } else if (n > 0) { + p = rcu_dereference_raw(p->rb_right); + } else { + if (atomic_inc_not_zero(&cell->usage)) { + ret = 0; + break; + } + /* We want to repeat the search, this time with + * the lock properly locked. + */ + } + cell = NULL; + } - } else { - _vllist = vllist; - } + } while (need_seqretry(&net->cells_lock, seq)); - /* fill in the VL server list from the rest of the string */ - do { - unsigned a, b, c, d; + done_seqretry(&net->cells_lock, seq); - next = strchr(_vllist, delimiter); - if (next) - *next++ = 0; + return ret == 0 ? cell : ERR_PTR(ret); +} - if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) - goto bad_address; +/* + * Set up a cell record and fill in its name, VL server address list and + * allocate an anonymous key + */ +static struct afs_cell *afs_alloc_cell(struct afs_net *net, + const char *name, unsigned int namelen, + const char *vllist) +{ + struct afs_cell *cell; + int i, ret; - if (a > 255 || b > 255 || c > 255 || d > 255) - goto bad_address; + ASSERT(name); + if (namelen == 0) + return ERR_PTR(-EINVAL); + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } - cell->vl_addrs[cell->vl_naddrs++].s_addr = - htonl((a << 24) | (b << 16) | (c << 8) | d); + _enter("%*.*s,%s", namelen, namelen, name, vllist); - } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); + cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL); + if (!cell) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } - /* create a key to represent an anonymous user */ - memcpy(keyname, "afs@", 4); - dp = keyname + 4; - cp = cell->name; - do { - *dp++ = toupper(*cp); - } while (*cp++); + cell->net = net; + cell->name_len = namelen; + for (i = 0; i < namelen; i++) + cell->name[i] = tolower(name[i]); + + atomic_set(&cell->usage, 2); + INIT_WORK(&cell->manager, afs_manage_cell); + cell->flags = ((1 << AFS_CELL_FL_NOT_READY) | + (1 << AFS_CELL_FL_NO_LOOKUP_YET)); + INIT_LIST_HEAD(&cell->proc_volumes); + rwlock_init(&cell->proc_lock); + rwlock_init(&cell->vl_addrs_lock); + + /* Fill in the VL server list if we were given a list of addresses to + * use. 
+ */ + if (vllist) { + struct afs_addr_list *alist; + + alist = afs_parse_text_addrs(vllist, strlen(vllist), ':', + VL_SERVICE, AFS_VL_PORT); + if (IS_ERR(alist)) { + ret = PTR_ERR(alist); + goto parse_failed; + } - key = rxrpc_get_null_key(keyname); - if (IS_ERR(key)) { - _debug("no key"); - ret = PTR_ERR(key); - goto error; + rcu_assign_pointer(cell->vl_addrs, alist); + cell->dns_expiry = TIME64_MAX; } - cell->anonymous_key = key; - - _debug("anon key %p{%x}", - cell->anonymous_key, key_serial(cell->anonymous_key)); _leave(" = %p", cell); return cell; -bad_address: - printk(KERN_ERR "kAFS: bad VL server IP address\n"); - ret = -EINVAL; -error: - key_put(cell->anonymous_key); - kfree(dvllist); +parse_failed: + if (ret == -EINVAL) + printk(KERN_ERR "kAFS: bad VL server IP address\n"); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); } /* - * afs_cell_crate() - create a cell record - * @name: is the name of the cell. - * @namsesz: is the strlen of the cell name. - * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format. - * @retref: is T to return the cell reference when the cell exists. + * afs_lookup_cell - Look up or create a cell record. + * @net: The network namespace + * @name: The name of the cell. + * @namesz: The strlen of the cell name. + * @vllist: A colon/comma separated list of numeric IP addresses or NULL. + * @excl: T if an error should be given if the cell name already exists. + * + * Look up a cell record by name and query the DNS for VL server addresses if + * needed. Note that that actual DNS query is punted off to the manager thread + * so that this function can return immediately if interrupted whilst allowing + * cell records to be shared even if not yet fully constructed. */ -struct afs_cell *afs_cell_create(const char *name, unsigned namesz, - char *vllist, bool retref) +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, bool excl) { - struct afs_cell *cell; - int ret; - - _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist); + struct afs_cell *cell, *candidate, *cursor; + struct rb_node *parent, **pp; + int ret, n; + + _enter("%s,%s", name, vllist); + + if (!excl) { + rcu_read_lock(); + cell = afs_lookup_cell_rcu(net, name, namesz); + rcu_read_unlock(); + if (!IS_ERR(cell)) + goto wait_for_cell; + } - down_write(&afs_cells_sem); - read_lock(&afs_cells_lock); - list_for_each_entry(cell, &afs_cells, link) { - if (strncasecmp(cell->name, name, namesz) == 0) - goto duplicate_name; + /* Assume we're probably going to create a cell and preallocate and + * mostly set up a candidate record. We can then use this to stash the + * name, the net namespace and VL server addresses. + * + * We also want to do this before we hold any locks as it may involve + * upcalling to userspace to make DNS queries. + */ + candidate = afs_alloc_cell(net, name, namesz, vllist); + if (IS_ERR(candidate)) { + _leave(" = %ld", PTR_ERR(candidate)); + return candidate; } - read_unlock(&afs_cells_lock); - cell = afs_cell_alloc(name, namesz, vllist); - if (IS_ERR(cell)) { - _leave(" = %ld", PTR_ERR(cell)); - up_write(&afs_cells_sem); - return cell; + /* Find the insertion point and check to see if someone else added a + * cell whilst we were allocating. 
+ */ + write_seqlock(&net->cells_lock); + + pp = &net->cells.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + cursor = rb_entry(parent, struct afs_cell, net_node); + + n = strncasecmp(cursor->name, name, + min_t(size_t, cursor->name_len, namesz)); + if (n == 0) + n = cursor->name_len - namesz; + if (n < 0) + pp = &(*pp)->rb_left; + else if (n > 0) + pp = &(*pp)->rb_right; + else + goto cell_already_exists; } - /* add a proc directory for this cell */ - ret = afs_proc_cell_setup(cell); - if (ret < 0) - goto error; + cell = candidate; + candidate = NULL; + rb_link_node_rcu(&cell->net_node, parent, pp); + rb_insert_color(&cell->net_node, &net->cells); + atomic_inc(&net->cells_outstanding); + write_sequnlock(&net->cells_lock); -#ifdef CONFIG_AFS_FSCACHE - /* put it up for caching (this never returns an error) */ - cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell, true); -#endif + queue_work(afs_wq, &cell->manager); - /* add to the cell lists */ - write_lock(&afs_cells_lock); - list_add_tail(&cell->link, &afs_cells); - write_unlock(&afs_cells_lock); +wait_for_cell: + _debug("wait_for_cell"); + ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NOT_READY, TASK_INTERRUPTIBLE); + smp_rmb(); - down_write(&afs_proc_cells_sem); - list_add_tail(&cell->proc_link, &afs_proc_cells); - up_write(&afs_proc_cells_sem); - up_write(&afs_cells_sem); + switch (READ_ONCE(cell->state)) { + case AFS_CELL_FAILED: + ret = cell->error; + goto error; + default: + _debug("weird %u %d", cell->state, cell->error); + goto error; + case AFS_CELL_ACTIVE: + break; + } - _leave(" = %p", cell); + _leave(" = %p [cell]", cell); return cell; +cell_already_exists: + _debug("cell exists"); + cell = cursor; + if (excl) { + ret = -EEXIST; + } else { + afs_get_cell(cursor); + ret = 0; + } + write_sequnlock(&net->cells_lock); + kfree(candidate); + if (ret == 0) + goto wait_for_cell; + goto error_noput; error: - up_write(&afs_cells_sem); - key_put(cell->anonymous_key); - kfree(cell); - _leave(" = %d", ret); + afs_put_cell(net, cell); +error_noput: + _leave(" = %d [error]", ret); return ERR_PTR(ret); - -duplicate_name: - if (retref && !IS_ERR(cell)) - afs_get_cell(cell); - - read_unlock(&afs_cells_lock); - up_write(&afs_cells_sem); - - if (retref) { - _leave(" = %p", cell); - return cell; - } - - _leave(" = -EEXIST"); - return ERR_PTR(-EEXIST); } /* @@ -223,10 +301,11 @@ duplicate_name: * - can be called with a module parameter string * - can be called from a write to /proc/fs/afs/rootcell */ -int afs_cell_init(char *rootcell) +int afs_cell_init(struct afs_net *net, const char *rootcell) { struct afs_cell *old_root, *new_root; - char *cp; + const char *cp, *vllist; + size_t len; _enter(""); @@ -239,222 +318,453 @@ int afs_cell_init(char *rootcell) } cp = strchr(rootcell, ':'); - if (!cp) + if (!cp) { _debug("kAFS: no VL server IP addresses specified"); - else - *cp++ = 0; + vllist = NULL; + len = strlen(rootcell); + } else { + vllist = cp + 1; + len = cp - rootcell; + } /* allocate a cell record for the root cell */ - new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false); + new_root = afs_lookup_cell(net, rootcell, len, vllist, false); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); } + set_bit(AFS_CELL_FL_NO_GC, &new_root->flags); + afs_get_cell(new_root); + /* install the new cell */ - write_lock(&afs_cells_lock); - old_root = afs_cell_root; - afs_cell_root = new_root; - write_unlock(&afs_cells_lock); - afs_put_cell(old_root); + 
write_seqlock(&net->cells_lock); + old_root = net->ws_cell; + net->ws_cell = new_root; + write_sequnlock(&net->cells_lock); + afs_put_cell(net, old_root); _leave(" = 0"); return 0; } /* - * lookup a cell record + * Update a cell's VL server address list from the DNS. */ -struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz, - bool dns_cell) +static void afs_update_cell(struct afs_cell *cell) { - struct afs_cell *cell; - - _enter("\"%*.*s\",", namesz, namesz, name ?: ""); - - down_read(&afs_cells_sem); - read_lock(&afs_cells_lock); - - if (name) { - /* if the cell was named, look for it in the cell record list */ - list_for_each_entry(cell, &afs_cells, link) { - if (strncmp(cell->name, name, namesz) == 0) { - afs_get_cell(cell); - goto found; - } + struct afs_addr_list *alist, *old; + time64_t now, expiry; + + _enter("%s", cell->name); + + alist = afs_dns_query(cell, &expiry); + if (IS_ERR(alist)) { + switch (PTR_ERR(alist)) { + case -ENODATA: + /* The DNS said that the cell does not exist */ + set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags); + clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); + cell->dns_expiry = ktime_get_real_seconds() + 61; + break; + + case -EAGAIN: + case -ECONNREFUSED: + default: + set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); + cell->dns_expiry = ktime_get_real_seconds() + 10; + break; } - cell = ERR_PTR(-ENOENT); - if (dns_cell) - goto create_cell; - found: - ; + + cell->error = -EDESTADDRREQ; } else { - cell = afs_cell_root; - if (!cell) { - /* this should not happen unless user tries to mount - * when root cell is not set. Return an impossibly - * bizarre errno to alert the user. Things like - * ENOENT might be "more appropriate" but they happen - * for other reasons. - */ - cell = ERR_PTR(-EDESTADDRREQ); - } else { - afs_get_cell(cell); - } + clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); + clear_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags); + + /* Exclusion on changing vl_addrs is achieved by a + * non-reentrant work item. + */ + old = rcu_dereference_protected(cell->vl_addrs, true); + rcu_assign_pointer(cell->vl_addrs, alist); + cell->dns_expiry = expiry; + if (old) + afs_put_addrlist(old); } - read_unlock(&afs_cells_lock); - up_read(&afs_cells_sem); - _leave(" = %p", cell); - return cell; + if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags)) + wake_up_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET); -create_cell: - read_unlock(&afs_cells_lock); - up_read(&afs_cells_sem); + now = ktime_get_real_seconds(); + afs_set_cell_timer(cell->net, cell->dns_expiry - now); + _leave(""); +} - cell = afs_cell_create(name, namesz, NULL, true); +/* + * Destroy a cell record + */ +static void afs_cell_destroy(struct rcu_head *rcu) +{ + struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu); - _leave(" = %p", cell); - return cell; + _enter("%p{%s}", cell, cell->name); + + ASSERTCMP(atomic_read(&cell->usage), ==, 0); + + afs_put_addrlist(cell->vl_addrs); + key_put(cell->anonymous_key); + kfree(cell); + + _leave(" [destroyed]"); } -#if 0 /* - * try and get a cell record + * Queue the cell manager. 
*/ -struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell) +static void afs_queue_cell_manager(struct afs_net *net) { - write_lock(&afs_cells_lock); + int outstanding = atomic_inc_return(&net->cells_outstanding); - if (cell && !list_empty(&cell->link)) - afs_get_cell(cell); - else - cell = NULL; + _enter("%d", outstanding); - write_unlock(&afs_cells_lock); - return cell; + if (!queue_work(afs_wq, &net->cells_manager)) + afs_dec_cells_outstanding(net); } -#endif /* 0 */ /* - * destroy a cell record + * Cell management timer. We have an increment on cells_outstanding that we + * need to pass along to the work item. */ -void afs_put_cell(struct afs_cell *cell) +void afs_cells_timer(struct timer_list *timer) { - if (!cell) - return; + struct afs_net *net = container_of(timer, struct afs_net, cells_timer); - _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + _enter(""); + if (!queue_work(afs_wq, &net->cells_manager)) + afs_dec_cells_outstanding(net); +} - ASSERTCMP(atomic_read(&cell->usage), >, 0); +/* + * Get a reference on a cell record. + */ +struct afs_cell *afs_get_cell(struct afs_cell *cell) +{ + atomic_inc(&cell->usage); + return cell; +} - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - write_lock(&afs_cells_lock); +/* + * Drop a reference on a cell record. + */ +void afs_put_cell(struct afs_net *net, struct afs_cell *cell) +{ + time64_t now, expire_delay; - if (likely(!atomic_dec_and_test(&cell->usage))) { - write_unlock(&afs_cells_lock); - _leave(""); + if (!cell) return; - } - ASSERT(list_empty(&cell->servers)); - ASSERT(list_empty(&cell->vl_list)); + _enter("%s", cell->name); - write_unlock(&afs_cells_lock); + now = ktime_get_real_seconds(); + cell->last_inactive = now; + expire_delay = 0; + if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) && + !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags)) + expire_delay = afs_cell_gc_delay; - wake_up(&afs_cells_freeable_wq); + if (atomic_dec_return(&cell->usage) > 1) + return; - _leave(" [unused]"); + /* 'cell' may now be garbage collected. */ + afs_set_cell_timer(net, expire_delay); } /* - * destroy a cell record - * - must be called with the afs_cells_sem write-locked - * - cell->link should have been broken by the caller + * Allocate a key to use as a placeholder for anonymous user security. */ -static void afs_cell_destroy(struct afs_cell *cell) +static int afs_alloc_anon_key(struct afs_cell *cell) { - _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + struct key *key; + char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp; + + /* Create a key to represent an anonymous user. */ + memcpy(keyname, "afs@", 4); + dp = keyname + 4; + cp = cell->name; + do { + *dp++ = tolower(*cp); + } while (*cp++); - ASSERTCMP(atomic_read(&cell->usage), >=, 0); - ASSERT(list_empty(&cell->link)); + key = rxrpc_get_null_key(keyname); + if (IS_ERR(key)) + return PTR_ERR(key); - /* wait for everyone to stop using the cell */ - if (atomic_read(&cell->usage) > 0) { - DECLARE_WAITQUEUE(myself, current); + cell->anonymous_key = key; - _debug("wait for cell %s", cell->name); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&afs_cells_freeable_wq, &myself); + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + return 0; +} - while (atomic_read(&cell->usage) > 0) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } +/* + * Activate a cell. 
+ */ +static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) +{ + int ret; - remove_wait_queue(&afs_cells_freeable_wq, &myself); - set_current_state(TASK_RUNNING); + if (!cell->anonymous_key) { + ret = afs_alloc_anon_key(cell); + if (ret < 0) + return ret; } - _debug("cell dead"); - ASSERTCMP(atomic_read(&cell->usage), ==, 0); - ASSERT(list_empty(&cell->servers)); - ASSERT(list_empty(&cell->vl_list)); +#ifdef CONFIG_AFS_FSCACHE + cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell, true); +#endif + ret = afs_proc_cell_setup(net, cell); + if (ret < 0) + return ret; + spin_lock(&net->proc_cells_lock); + list_add_tail(&cell->proc_link, &net->proc_cells); + spin_unlock(&net->proc_cells_lock); + return 0; +} - afs_proc_cell_remove(cell); +/* + * Deactivate a cell. + */ +static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) +{ + _enter("%s", cell->name); + + afs_proc_cell_remove(net, cell); - down_write(&afs_proc_cells_sem); + spin_lock(&net->proc_cells_lock); list_del_init(&cell->proc_link); - up_write(&afs_proc_cells_sem); + spin_unlock(&net->proc_cells_lock); #ifdef CONFIG_AFS_FSCACHE fscache_relinquish_cookie(cell->cache, 0); + cell->cache = NULL; #endif - key_put(cell->anonymous_key); - kfree(cell); - _leave(" [destroyed]"); + _leave(""); } /* - * purge in-memory cell database on module unload or afs_init() failure - * - the timeout daemon is stopped before calling this + * Manage a cell record, initialising and destroying it, maintaining its DNS + * records. */ -void afs_cell_purge(void) +static void afs_manage_cell(struct work_struct *work) { - struct afs_cell *cell; + struct afs_cell *cell = container_of(work, struct afs_cell, manager); + struct afs_net *net = cell->net; + bool deleted; + int ret, usage; + + _enter("%s", cell->name); + +again: + _debug("state %u", cell->state); + switch (cell->state) { + case AFS_CELL_INACTIVE: + case AFS_CELL_FAILED: + write_seqlock(&net->cells_lock); + usage = 1; + deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0); + if (deleted) + rb_erase(&cell->net_node, &net->cells); + write_sequnlock(&net->cells_lock); + if (deleted) + goto final_destruction; + if (cell->state == AFS_CELL_FAILED) + goto done; + cell->state = AFS_CELL_UNSET; + goto again; + + case AFS_CELL_UNSET: + cell->state = AFS_CELL_ACTIVATING; + goto again; + + case AFS_CELL_ACTIVATING: + ret = afs_activate_cell(net, cell); + if (ret < 0) + goto activation_failed; + + cell->state = AFS_CELL_ACTIVE; + smp_wmb(); + clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags); + wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY); + goto again; + + case AFS_CELL_ACTIVE: + if (atomic_read(&cell->usage) > 1) { + time64_t now = ktime_get_real_seconds(); + if (cell->dns_expiry <= now && net->live) + afs_update_cell(cell); + goto done; + } + cell->state = AFS_CELL_DEACTIVATING; + goto again; + + case AFS_CELL_DEACTIVATING: + set_bit(AFS_CELL_FL_NOT_READY, &cell->flags); + if (atomic_read(&cell->usage) > 1) + goto reverse_deactivation; + afs_deactivate_cell(net, cell); + cell->state = AFS_CELL_INACTIVE; + goto again; + + default: + break; + } + _debug("bad state %u", cell->state); + BUG(); /* Unhandled state */ + +activation_failed: + cell->error = ret; + afs_deactivate_cell(net, cell); + + cell->state = AFS_CELL_FAILED; + smp_wmb(); + if (test_and_clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags)) + wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY); + goto again; + +reverse_deactivation: + cell->state = 
AFS_CELL_ACTIVE; + smp_wmb(); + clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags); + wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY); + _leave(" [deact->act]"); + return; + +done: + _leave(" [done %u]", cell->state); + return; + +final_destruction: + call_rcu(&cell->rcu, afs_cell_destroy); + afs_dec_cells_outstanding(net); + _leave(" [destruct %d]", atomic_read(&net->cells_outstanding)); +} + +/* + * Manage the records of cells known to a network namespace. This includes + * updating the DNS records and garbage collecting unused cells that were + * automatically added. + * + * Note that constructed cell records may only be removed from net->cells by + * this work item, so it is safe for this work item to stash a cursor pointing + * into the tree and then return to caller (provided it skips cells that are + * still under construction). + * + * Note also that we were given an increment on net->cells_outstanding by + * whoever queued us that we need to deal with before returning. + */ +void afs_manage_cells(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, cells_manager); + struct rb_node *cursor; + time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; + bool purging = !net->live; _enter(""); - afs_put_cell(afs_cell_root); + /* Trawl the cell database looking for cells that have expired from + * lack of use and cells whose DNS results have expired and dispatch + * their managers. + */ + read_seqlock_excl(&net->cells_lock); - down_write(&afs_cells_sem); + for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { + struct afs_cell *cell = + rb_entry(cursor, struct afs_cell, net_node); + unsigned usage; + bool sched_cell = false; - while (!list_empty(&afs_cells)) { - cell = NULL; + usage = atomic_read(&cell->usage); + _debug("manage %s %u", cell->name, usage); + + ASSERTCMP(usage, >=, 1); - /* remove the next cell from the front of the list */ - write_lock(&afs_cells_lock); + if (purging) { + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + usage = atomic_dec_return(&cell->usage); + ASSERTCMP(usage, ==, 1); + } + + if (usage == 1) { + time64_t expire_at = cell->last_inactive; + + if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) && + !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags)) + expire_at += afs_cell_gc_delay; + if (purging || expire_at <= now) + sched_cell = true; + else if (expire_at < next_manage) + next_manage = expire_at; + } - if (!list_empty(&afs_cells)) { - cell = list_entry(afs_cells.next, - struct afs_cell, link); - list_del_init(&cell->link); + if (!purging) { + if (cell->dns_expiry <= now) + sched_cell = true; + else if (cell->dns_expiry <= next_manage) + next_manage = cell->dns_expiry; } - write_unlock(&afs_cells_lock); + if (sched_cell) + queue_work(afs_wq, &cell->manager); + } + + read_sequnlock_excl(&net->cells_lock); - if (cell) { - _debug("PURGING CELL %s (%d)", - cell->name, atomic_read(&cell->usage)); + /* Update the timer on the way out. We have to pass an increment on + * cells_outstanding in the namespace that we are in to the timer or + * the work scheduler. 
+ */ + if (!purging && next_manage < TIME64_MAX) { + now = ktime_get_real_seconds(); - /* now the cell should be left with no references */ - afs_cell_destroy(cell); + if (next_manage - now <= 0) { + if (queue_work(afs_wq, &net->cells_manager)) + atomic_inc(&net->cells_outstanding); + } else { + afs_set_cell_timer(net, next_manage - now); } } - up_write(&afs_cells_sem); + afs_dec_cells_outstanding(net); + _leave(" [%d]", atomic_read(&net->cells_outstanding)); +} + +/* + * Purge in-memory cell database. + */ +void afs_cell_purge(struct afs_net *net) +{ + struct afs_cell *ws; + + _enter(""); + + write_seqlock(&net->cells_lock); + ws = net->ws_cell; + net->ws_cell = NULL; + write_sequnlock(&net->cells_lock); + afs_put_cell(net, ws); + + _debug("del timer"); + if (del_timer_sync(&net->cells_timer)) + atomic_dec(&net->cells_outstanding); + + _debug("kick mgr"); + afs_queue_cell_manager(net); + + _debug("wait"); + wait_on_atomic_t(&net->cells_outstanding, atomic_t_wait, + TASK_UNINTERRUPTIBLE); _leave(""); } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 782d4d05a53b..41e277f57b20 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -41,7 +41,6 @@ static CM_NAME(CallBack); static const struct afs_call_type afs_SRXCBCallBack = { .name = afs_SRXCBCallBack_name, .deliver = afs_deliver_cb_callback, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_CallBack, }; @@ -53,7 +52,6 @@ static CM_NAME(InitCallBackState); static const struct afs_call_type afs_SRXCBInitCallBackState = { .name = afs_SRXCBInitCallBackState_name, .deliver = afs_deliver_cb_init_call_back_state, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_InitCallBackState, }; @@ -65,7 +63,6 @@ static CM_NAME(InitCallBackState3); static const struct afs_call_type afs_SRXCBInitCallBackState3 = { .name = afs_SRXCBInitCallBackState3_name, .deliver = afs_deliver_cb_init_call_back_state3, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_InitCallBackState, }; @@ -77,7 +74,6 @@ static CM_NAME(Probe); static const struct afs_call_type afs_SRXCBProbe = { .name = afs_SRXCBProbe_name, .deliver = afs_deliver_cb_probe, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_Probe, }; @@ -89,7 +85,6 @@ static CM_NAME(ProbeUuid); static const struct afs_call_type afs_SRXCBProbeUuid = { .name = afs_SRXCBProbeUuid_name, .deliver = afs_deliver_cb_probe_uuid, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_ProbeUuid, }; @@ -101,7 +96,6 @@ static CM_NAME(TellMeAboutYourself); static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { .name = afs_SRXCBTellMeAboutYourself_name, .deliver = afs_deliver_cb_tell_me_about_yourself, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_TellMeAboutYourself, }; @@ -127,6 +121,9 @@ bool afs_cm_incoming_call(struct afs_call *call) case CBProbe: call->type = &afs_SRXCBProbe; return true; + case CBProbeUuid: + call->type = &afs_SRXCBProbeUuid; + return true; case CBTellMeAboutYourself: call->type = &afs_SRXCBTellMeAboutYourself; return true; @@ -147,18 +144,16 @@ static void afs_cm_destructor(struct afs_call *call) * afs_deliver_cb_callback(). 
*/ if (call->unmarshall == 5) { - ASSERT(call->server && call->count && call->request); - afs_break_callbacks(call->server, call->count, call->request); + ASSERT(call->cm_server && call->count && call->request); + afs_break_callbacks(call->cm_server, call->count, call->request); } - afs_put_server(call->server); - call->server = NULL; kfree(call->buffer); call->buffer = NULL; } /* - * allow the fileserver to see if the cache manager is still alive + * The server supplied a list of callbacks that it wanted to break. */ static void SRXAFSCB_CallBack(struct work_struct *work) { @@ -173,7 +168,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work) * yet */ afs_send_empty_reply(call); - afs_break_callbacks(call->server, call->count, call->request); + afs_break_callbacks(call->cm_server, call->count, call->request); afs_put_call(call); _leave(""); } @@ -193,7 +188,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) switch (call->unmarshall) { case 0: - rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); call->offset = 0; call->unmarshall++; @@ -286,14 +280,16 @@ static int afs_deliver_cb_callback(struct afs_call *call) break; } - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(&srx); + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + server = afs_find_server(call->net, &srx); if (!server) return -ENOTCONN; - call->server = server; + call->cm_server = server; return afs_queue_call_work(call); } @@ -305,9 +301,9 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, work); - _enter("{%p}", call->server); + _enter("{%p}", call->cm_server); - afs_init_callback_state(call->server); + afs_init_callback_state(call->cm_server); afs_send_empty_reply(call); afs_put_call(call); _leave(""); @@ -324,21 +320,18 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) _enter(""); - rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; - /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(&srx); + server = afs_find_server(call->net, &srx); if (!server) return -ENOTCONN; - call->server = server; + call->cm_server = server; return afs_queue_call_work(call); } @@ -357,8 +350,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) _enter(""); - rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); - _enter("{%u}", call->unmarshall); switch (call->unmarshall) { @@ -402,15 +393,16 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) break; } - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(&srx); + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + server = afs_find_server(call->net, &srx); if (!server) return -ENOTCONN; - call->server = server; + call->cm_server = server; return afs_queue_call_work(call); } @@ -441,8 +433,8 @@ static int afs_deliver_cb_probe(struct afs_call *call) if 
(ret < 0) return ret; - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; return afs_queue_call_work(call); } @@ -461,7 +453,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) _enter(""); - if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) + if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0) reply.match = htonl(0); else reply.match = htonl(1); @@ -524,7 +516,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) break; } - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; return afs_queue_call_work(call); } @@ -568,13 +561,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) memset(&reply, 0, sizeof(reply)); reply.ia.nifs = htonl(nifs); - reply.ia.uuid[0] = afs_uuid.time_low; - reply.ia.uuid[1] = htonl(ntohs(afs_uuid.time_mid)); - reply.ia.uuid[2] = htonl(ntohs(afs_uuid.time_hi_and_version)); - reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved); - reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low); + reply.ia.uuid[0] = call->net->uuid.time_low; + reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid)); + reply.ia.uuid[2] = htonl(ntohs(call->net->uuid.time_hi_and_version)); + reply.ia.uuid[3] = htonl((s8) call->net->uuid.clock_seq_hi_and_reserved); + reply.ia.uuid[4] = htonl((s8) call->net->uuid.clock_seq_low); for (loop = 0; loop < 6; loop++) - reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]); + reply.ia.uuid[loop + 5] = htonl((s8) call->net->uuid.node[loop]); if (ifs) { for (loop = 0; loop < nifs; loop++) { @@ -605,8 +598,8 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) if (ret < 0) return ret; - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; return afs_queue_call_work(call); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 613a77058263..ff8d5bf4354f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -130,10 +130,11 @@ struct afs_lookup_cookie { /* * check that a directory page is valid */ -static inline bool afs_dir_check_page(struct inode *dir, struct page *page) +bool afs_dir_check_page(struct inode *dir, struct page *page) { struct afs_dir_page *dbuf; - loff_t latter; + struct afs_vnode *vnode = AFS_FS_I(dir); + loff_t latter, i_size, off; int tmp, qty; #if 0 @@ -150,8 +151,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page) } #endif - /* determine how many magic numbers there should be in this page */ - latter = dir->i_size - page_offset(page); + /* Determine how many magic numbers there should be in this page, but + * we must take care because the directory may change size under us. 
+ */ + off = page_offset(page); + i_size = i_size_read(dir); + if (i_size <= off) + goto checked; + + latter = i_size - off; if (latter >= PAGE_SIZE) qty = PAGE_SIZE; else @@ -162,13 +170,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page) dbuf = page_address(page); for (tmp = 0; tmp < qty; tmp++) { if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { - printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", + printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", __func__, dir->i_ino, tmp, qty, ntohs(dbuf->blocks[tmp].pagehdr.magic)); + trace_afs_dir_check_failed(vnode, off, i_size); goto error; } } +checked: SetPageChecked(page); return true; @@ -183,6 +193,7 @@ error: static inline void afs_dir_put_page(struct page *page) { kunmap(page); + unlock_page(page); put_page(page); } @@ -197,9 +208,10 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); if (!IS_ERR(page)) { + lock_page(page); kmap(page); if (unlikely(!PageChecked(page))) { - if (PageError(page) || !afs_dir_check_page(dir, page)) + if (PageError(page)) goto fail; } } @@ -384,8 +396,7 @@ out: */ static int afs_readdir(struct file *file, struct dir_context *ctx) { - return afs_dir_iterate(file_inode(file), - ctx, file->private_data); + return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file)); } /* @@ -553,7 +564,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; /* instantiate the dentry */ - inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL); + inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL); key_put(key); if (IS_ERR(inode)) { _leave(" = %ld", PTR_ERR(inode)); @@ -581,6 +592,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) struct afs_vnode *vnode, *dir; struct afs_fid uninitialized_var(fid); struct dentry *parent; + struct inode *inode; struct key *key; void *dir_version; int ret; @@ -588,30 +600,39 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - vnode = AFS_FS_I(d_inode(dentry)); - - if (d_really_is_positive(dentry)) + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); _enter("{v={%x:%u} n=%pd fl=%lx},", vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); - else + } else { _enter("{neg n=%pd}", dentry); + } key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell); if (IS_ERR(key)) key = NULL; + if (d_really_is_positive(dentry)) { + inode = d_inode(dentry); + if (inode) { + vnode = AFS_FS_I(inode); + afs_validate(vnode, key); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto out_bad; + } + } + /* lock down the parent dentry so we can peer at it */ parent = dget_parent(dentry); dir = AFS_FS_I(d_inode(parent)); /* validate the parent directory */ - if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) - afs_validate(dir, key); + afs_validate(dir, key); if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { _debug("%pd: parent dir deleted", dentry); - goto out_bad; + goto out_bad_parent; } dir_version = (void *) (unsigned long) dir->status.data_version; @@ -626,13 +647,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case 0: /* the filename maps to something */ if (d_really_is_negative(dentry)) - goto out_bad; - if (is_bad_inode(d_inode(dentry))) { + goto out_bad_parent; + inode = d_inode(dentry); + if (is_bad_inode(inode)) { printk("kAFS: 
afs_d_revalidate: %pd2 has bad inode\n", dentry); - goto out_bad; + goto out_bad_parent; } + vnode = AFS_FS_I(inode); + /* if the vnode ID has changed, then the dirent points to a * different file */ if (fid.vnode != vnode->fid.vnode) { @@ -649,10 +673,10 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - d_inode(dentry)->i_generation); - spin_lock(&vnode->lock); + vnode->vfs_inode.i_generation); + write_seqlock(&vnode->cb_lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); - spin_unlock(&vnode->lock); + write_sequnlock(&vnode->cb_lock); goto not_found; } goto out_valid; @@ -667,7 +691,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) default: _debug("failed to iterate dir %pd: %d", parent, ret); - goto out_bad; + goto out_bad_parent; } out_valid: @@ -683,9 +707,10 @@ not_found: dentry->d_flags |= DCACHE_NFSFS_RENAMED; spin_unlock(&dentry->d_lock); -out_bad: +out_bad_parent: _debug("dropping dentry %pd2", dentry); dput(parent); +out_bad: key_put(key); _leave(" = 0 [bad]"); @@ -727,20 +752,48 @@ static void afs_d_release(struct dentry *dentry) } /* + * Create a new inode for create/mkdir/symlink + */ +static void afs_vnode_new_inode(struct afs_fs_cursor *fc, + struct dentry *new_dentry, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb) +{ + struct inode *inode; + + if (fc->ac.error < 0) + return; + + d_drop(new_dentry); + + inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key, + newfid, newstatus, newcb, fc->cbi); + if (IS_ERR(inode)) { + /* ENOMEM or EINTR at a really inconvenient time - just abandon + * the new directory on the server. + */ + fc->ac.error = PTR_ERR(inode); + return; + } + + d_add(new_dentry, inode); +} + +/* * create a directory on an AFS filesystem */ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct afs_file_status status; - struct afs_callback cb; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; + struct afs_file_status newstatus; + struct afs_fs_cursor fc; + struct afs_callback newcb; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid newfid; struct key *key; int ret; - dvnode = AFS_FS_I(dir); + mode |= S_IFDIR; _enter("{%x:%u},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); @@ -751,40 +804,29 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto error; } - mode |= S_IFDIR; - ret = afs_vnode_create(dvnode, key, dentry->d_name.name, - mode, &fid, &status, &cb, &server); - if (ret < 0) - goto mkdir_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_create(&fc, dentry->d_name.name, mode, + &newfid, &newstatus, &newcb); + } - inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); - if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; } - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - 
vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } key_put(key); _leave(" = 0"); return 0; -iget_error: - afs_put_server(server); -mkdir_error: +error_key: key_put(key); error: d_drop(dentry); @@ -793,16 +835,29 @@ error: } /* + * Remove a subdir from a directory. + */ +static void afs_dir_remove_subdir(struct dentry *dentry) +{ + if (d_really_is_positive(dentry)) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + + clear_nlink(&vnode->vfs_inode); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } +} + +/* * remove a directory from an AFS filesystem */ static int afs_rmdir(struct inode *dir, struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; + struct afs_fs_cursor fc; + struct afs_vnode *dvnode = AFS_FS_I(dir); struct key *key; int ret; - dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); @@ -812,45 +867,69 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) goto error; } - ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true); - if (ret < 0) - goto rmdir_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_remove(&fc, dentry->d_name.name, true); + } - if (d_really_is_positive(dentry)) { - vnode = AFS_FS_I(d_inode(dentry)); - clear_nlink(&vnode->vfs_inode); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - afs_discard_callback_on_delete(vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + if (ret == 0) + afs_dir_remove_subdir(dentry); } key_put(key); - _leave(" = 0"); - return 0; - -rmdir_error: - key_put(key); error: - _leave(" = %d", ret); return ret; } /* - * remove a file from an AFS filesystem + * Remove a link to a file or symlink from a directory. + * + * If the file was not deleted due to excess hard links, the fileserver will + * break the callback promise on the file - if it had one - before it returns + * to us, and if it was deleted, it won't break it at all. + * + * However, if we didn't have a callback promise outstanding, or it was + * outstanding on a different server, then it won't break it either... + */ +static int afs_dir_remove_link(struct dentry *dentry, struct key *key) +{ + int ret = 0; + + if (d_really_is_positive(dentry)) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + kdebug("AFS_VNODE_DELETED"); + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + + ret = afs_validate(vnode, key); + if (ret == -ESTALE) + ret = 0; + _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + } + + return ret; +} + +/* + * Remove a file or symlink from an AFS filesystem. */
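[Editorial note, not part of the commit: afs_unlink(), which the comment above introduces and which follows this aside, is typical of how the patch rebuilds every directory operation around the new fileserver rotation cursor: begin an operation on the directory vnode, reissue the RPC for as long as afs_select_fileserver() offers another server, then end the operation to collect the overall error. A minimal user-space model of that control flow, with invented stand-ins (struct cursor, do_rpc) for the kernel types, might look like this:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Invented stand-in for struct afs_fs_cursor. */
	struct cursor {
		int	index;		/* next server to try */
		int	nr_servers;
		int	error;		/* result reported by end_operation() */
	};

	static bool begin_operation(struct cursor *fc, int nr_servers)
	{
		fc->index = 0;
		fc->nr_servers = nr_servers;
		fc->error = 0;
		return true;	/* the kernel version can fail with -ERESTARTSYS */
	}

	/* Offer the next server, or stop the loop (cf. afs_select_fileserver()). */
	static bool select_server(struct cursor *fc, int *server)
	{
		if (fc->error == 0 && fc->index > 0)
			return false;		/* last attempt succeeded */
		if (fc->index >= fc->nr_servers)
			return false;		/* no servers left to try */
		*server = fc->index++;
		return true;
	}

	static int end_operation(struct cursor *fc)
	{
		return fc->error;
	}

	/* Invented RPC: only server 2 accepts the call. */
	static int do_rpc(int server)
	{
		return server == 2 ? 0 : -ETIMEDOUT;
	}

	int main(void)
	{
		struct cursor fc;
		int server;

		if (begin_operation(&fc, 4)) {
			while (select_server(&fc, &server)) {
				fc.error = do_rpc(server);
				printf("server %d -> %d\n", server, fc.error);
			}
		}
		return end_operation(&fc) ? 1 : 0;
	}

Judging by the diff, the real cursor also snapshots the vnode's callback-break counters (fc.cb_break) before each attempt, which lets afs_vnode_commit_status() tell whether the callback state changed while the RPC was in flight.]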
static int afs_unlink(struct inode *dir, struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; + struct afs_fs_cursor fc; + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct key *key; int ret; - dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) - goto error; + return -ENAMETOOLONG; key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { @@ -858,44 +937,28 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) goto error; } + /* Try to make sure we have a callback promise on the victim. */ if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); - - /* make sure we have a callback promise on the victim */ ret = afs_validate(vnode, key); if (ret < 0) - goto error; + goto error_key; } - ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false); - if (ret < 0) - goto remove_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_remove(&fc, dentry->d_name.name, false); + } - if (d_really_is_positive(dentry)) { - /* if the file wasn't deleted due to excess hard links, the - * fileserver will break the callback promise on the file - if - * it had one - before it returns to us, and if it was deleted, - * it won't - * - * however, if we didn't have a callback promise outstanding, - * or it was outstanding on a different server, then it won't - * break it either... - */ - vnode = AFS_FS_I(d_inode(dentry)); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - _debug("AFS_VNODE_DELETED"); - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) - _debug("AFS_VNODE_CB_BROKEN"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_validate(vnode, key); - _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + if (ret == 0) + ret = afs_dir_remove_link(dentry, key); } - key_put(key); - _leave(" = 0"); - return 0; - -remove_error: +error_key: key_put(key); error: _leave(" = %d", ret); @@ -908,60 +971,52 @@ error: static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct afs_file_status status; - struct afs_callback cb; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; + struct afs_fs_cursor fc; + struct afs_file_status newstatus; + struct afs_callback newcb; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid newfid; struct key *key; int ret; - dvnode = AFS_FS_I(dir); + mode |= S_IFREG; _enter("{%x:%u},{%pd},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); goto error; } - mode |= S_IFREG; - ret = afs_vnode_create(dvnode, key, dentry->d_name.name, - mode, &fid, &status, &cb, &server); - if (ret < 0) - goto create_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_create(&fc, dentry->d_name.name, mode, + &newfid, &newstatus, &newcb); + } - inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); - if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = 
PTR_ERR(inode); - goto iget_error; + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; } - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } key_put(key); _leave(" = 0"); return 0; -iget_error: - afs_put_server(server); -create_error: +error_key: key_put(key); error: d_drop(dentry); @@ -975,6 +1030,7 @@ error: static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry) { + struct afs_fs_cursor fc; struct afs_vnode *dvnode, *vnode; struct key *key; int ret; @@ -987,23 +1043,47 @@ static int afs_link(struct dentry *from, struct inode *dir, dvnode->fid.vid, dvnode->fid.vnode, dentry); + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); goto error; } - ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name); - if (ret < 0) - goto link_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) { + afs_end_vnode_operation(&fc); + goto error_key; + } + + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break; + afs_fs_link(&fc, vnode, dentry->d_name.name); + } + + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_commit_status(&fc, vnode, fc.cb_break_2); + ihold(&vnode->vfs_inode); + d_instantiate(dentry, &vnode->vfs_inode); + + mutex_unlock(&vnode->io_lock); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; + } - ihold(&vnode->vfs_inode); - d_instantiate(dentry, &vnode->vfs_inode); key_put(key); _leave(" = 0"); return 0; -link_error: +error_key: key_put(key); error: d_drop(dentry); @@ -1017,20 +1097,21 @@ error: static int afs_symlink(struct inode *dir, struct dentry *dentry, const char *content) { - struct afs_file_status status; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; + struct afs_fs_cursor fc; + struct afs_file_status newstatus; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid newfid; struct key *key; int ret; - dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%pd},%s", dvnode->fid.vid, dvnode->fid.vnode, dentry, content); + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + ret = -EINVAL; if (strlen(content) >= AFSPATHMAX) goto error; @@ -1041,39 +1122,29 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, goto error; } - ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content, - &fid, &status, &server); - if (ret < 0) - goto create_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_symlink(&fc, dentry->d_name.name, content, + &newfid, &newstatus); + } - inode = afs_iget(dir->i_sb, key, &fid, &status, NULL); - if (IS_ERR(inode)) { - /* ENOMEM at a really 
inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, NULL); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; } - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } key_put(key); _leave(" = 0"); return 0; -iget_error: - afs_put_server(server); -create_error: +error_key: key_put(key); error: d_drop(dentry); @@ -1088,6 +1159,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + struct afs_fs_cursor fc; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; struct key *key; int ret; @@ -1111,19 +1183,33 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, goto error; } - ret = afs_vnode_rename(orig_dvnode, new_dvnode, key, - old_dentry->d_name.name, - new_dentry->d_name.name); - if (ret < 0) - goto rename_error; - key_put(key); - _leave(" = 0"); - return 0; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, orig_dvnode, key)) { + if (orig_dvnode != new_dvnode) { + if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) { + afs_end_vnode_operation(&fc); + goto error_key; + } + } + while (afs_select_fileserver(&fc)) { + fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break; + fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break; + afs_fs_rename(&fc, old_dentry->d_name.name, + new_dvnode, new_dentry->d_name.name); + } -rename_error: + afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break); + afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2); + if (orig_dvnode != new_dvnode) + mutex_unlock(&new_dvnode->io_lock); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } + +error_key: key_put(key); error: - d_drop(new_dentry); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/file.c b/fs/afs/file.c index 510cba15fa56..a39192ced99e 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,11 +19,11 @@ #include <linux/task_io_accounting_ops.h> #include "internal.h" +static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_readpage(struct file *file, struct page *page); static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static int afs_launder_page(struct page *page); static int afs_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); @@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = afs_file_write, - .mmap = generic_file_readonly_mmap, + .mmap = afs_file_mmap, .splice_read = generic_file_splice_read, .fsync = afs_fsync, .lock = afs_lock, @@ -62,12 +62,63 @@ const struct address_space_operations afs_fs_aops = { .writepages = afs_writepages, }; +static const struct vm_operations_struct afs_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = 
afs_page_mkwrite, +}; + +/* + * Discard a pin on a writeback key. + */ +void afs_put_wb_key(struct afs_wb_key *wbk) +{ + if (refcount_dec_and_test(&wbk->usage)) { + key_put(wbk->key); + kfree(wbk); + } +} + +/* + * Cache key for writeback. + */ +int afs_cache_wb_key(struct afs_vnode *vnode, struct afs_file *af) +{ + struct afs_wb_key *wbk, *p; + + wbk = kzalloc(sizeof(struct afs_wb_key), GFP_KERNEL); + if (!wbk) + return -ENOMEM; + refcount_set(&wbk->usage, 2); + wbk->key = af->key; + + spin_lock(&vnode->wb_lock); + list_for_each_entry(p, &vnode->wb_keys, vnode_link) { + if (p->key == wbk->key) + goto found; + } + + key_get(wbk->key); + list_add_tail(&wbk->vnode_link, &vnode->wb_keys); + spin_unlock(&vnode->wb_lock); + af->wb = wbk; + return 0; + +found: + refcount_inc(&p->usage); + spin_unlock(&vnode->wb_lock); + af->wb = p; + kfree(wbk); + return 0; +} + /* * open an AFS file or directory and attach a key to it */ int afs_open(struct inode *inode, struct file *file) { struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af; struct key *key; int ret; @@ -75,19 +126,38 @@ int afs_open(struct inode *inode, struct file *file) key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return PTR_ERR(key); + ret = PTR_ERR(key); + goto error; } - ret = afs_validate(vnode, key); - if (ret < 0) { - _leave(" = %d [val]", ret); - return ret; + af = kzalloc(sizeof(*af), GFP_KERNEL); + if (!af) { + ret = -ENOMEM; + goto error_key; } + af->key = key; + + ret = afs_validate(vnode, key); + if (ret < 0) + goto error_af; - file->private_data = key; + if (file->f_mode & FMODE_WRITE) { + ret = afs_cache_wb_key(vnode, af); + if (ret < 0) + goto error_af; + } + + file->private_data = af; _leave(" = 0"); return 0; + +error_af: + kfree(af); +error_key: + key_put(key); +error: + _leave(" = %d", ret); + return ret; } /* @@ -96,10 +166,16 @@ int afs_open(struct inode *inode, struct file *file) int afs_release(struct inode *inode, struct file *file) { struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af = file->private_data; _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); - key_put(file->private_data); + file->private_data = NULL; + if (af->wb) + afs_put_wb_key(af->wb); + key_put(af->key); + kfree(af); + afs_prune_wb_keys(vnode); _leave(" = 0"); return 0; } @@ -138,6 +214,37 @@ static void afs_file_readpage_read_complete(struct page *page, #endif /* + * Fetch file data from the volume. 
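+ */

[Editorial note, not part of the commit: afs_cache_wb_key() above shares one refcounted struct afs_wb_key per distinct key on a vnode. The record is allocated with a usage count of 2 (one reference for the vnode's list, one for the file), and if another open file already contributed the same key, the fresh allocation is discarded and the existing record's count is bumped instead. A freestanding sketch of that share-or-insert pattern, with invented types and no locking:

	#include <stdlib.h>

	struct wb_key {
		struct wb_key	*next;	/* link in the owner's list */
		void		*key;	/* identity that records are shared on */
		int		usage;	/* reference count */
	};

	/* Return a record for @key in @list, inserting one if necessary; the
	 * caller receives one reference.  Returns NULL on allocation failure. */
	static struct wb_key *cache_wb_key(struct wb_key **list, void *key)
	{
		struct wb_key *wbk, *p;

		wbk = calloc(1, sizeof(*wbk));
		if (!wbk)
			return NULL;
		wbk->key = key;
		wbk->usage = 2;		/* one for the list, one for the caller */

		for (p = *list; p; p = p->next) {
			if (p->key == key) {
				p->usage++;	/* share the existing record */
				free(wbk);
				return p;
			}
		}

		wbk->next = *list;	/* no match: publish the new record */
		*list = wbk;
		return wbk;
	}

The kernel version additionally takes vnode->wb_lock around the walk and pins the underlying key itself with key_get(). afs_fetch_data(), introduced by the comment above, continues below.]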
+int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x,,,", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_fetch_data(&fc, desc); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* * read page from file, directory or symlink, given a key to use */ int afs_page_filler(void *data, struct page *page) @@ -199,8 +306,13 @@ int afs_page_filler(void *data, struct page *page) /* read the contents of the file from the server into the * page */ - ret = afs_vnode_fetch_data(vnode, key, req); + ret = afs_fetch_data(vnode, key, req); afs_put_read(req); + + if (ret >= 0 && S_ISDIR(inode->i_mode) && + !afs_dir_check_page(inode, page)) + ret = -EIO; + if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -259,12 +371,12 @@ static int afs_readpage(struct file *file, struct page *page) int ret; if (file) { - key = file->private_data; + key = afs_file_key(file); ASSERT(key != NULL); ret = afs_page_filler(key, page); } else { struct inode *inode = page->mapping->host; - key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + key = afs_request_key(AFS_FS_S(inode->i_sb)->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); } else { @@ -281,7 +393,7 @@ static int afs_readpage(struct file *file, struct page *page) static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req) { #ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; #endif struct page *page = req->pages[req->index]; @@ -310,7 +422,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, struct afs_read *req; struct list_head *p; struct page *first, *page; - struct key *key = file->private_data; + struct key *key = afs_file_key(file); pgoff_t index; int ret, n, i; @@ -369,7 +481,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, return 0; } - ret = afs_vnode_fetch_data(vnode, key, req); + ret = afs_fetch_data(vnode, key, req); if (ret < 0) goto error; @@ -406,7 +518,7 @@ error: static int afs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct key *key = file->private_data; + struct key *key = afs_file_key(file); struct afs_vnode *vnode; int ret = 0; @@ -464,16 +576,6 @@ static int afs_readpages(struct file *file, struct address_space *mapping, } /* - * write back a dirty page - */ -static int afs_launder_page(struct page *page) -{ - _enter("{%lu}", page->index); - - return 0; -} - -/* * invalidate part or all of a page * - release a page and clean up its private data if offset is 0 (indicating * the entire page) @@ -481,7 +583,8 @@ static int afs_launder_page(struct page *page) static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + unsigned long priv; _enter("{%lu},%u,%u", page->index, offset, length); @@ -498,13 +601,11 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, 
#endif if (PagePrivate(page)) { - if (wb && !PageWriteback(page)) { - set_page_private(page, 0); - afs_put_writeback(wb); - } - - if (!page_private(page)) - ClearPagePrivate(page); + priv = page_private(page); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), + page->index, priv); + set_page_private(page, 0); + ClearPagePrivate(page); } } @@ -517,8 +618,8 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, */ static int afs_releasepage(struct page *page, gfp_t gfp_flags) { - struct afs_writeback *wb = (struct afs_writeback *) page_private(page); struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + unsigned long priv; _enter("{{%x:%u}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, @@ -534,10 +635,10 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) #endif if (PagePrivate(page)) { - if (wb) { - set_page_private(page, 0); - afs_put_writeback(wb); - } + priv = page_private(page); + trace_afs_page_dirty(vnode, tracepoint_string("rel"), + page->index, priv); + set_page_private(page, 0); ClearPagePrivate(page); } @@ -545,3 +646,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) _leave(" = T"); return 1; } + +/* + * Handle setting up a memory mapping on an AFS file. + */ +static int afs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + + ret = generic_file_mmap(file, vma); + if (ret == 0) + vma->vm_ops = &afs_vm_ops; + return ret; +} diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 3191dff2c156..c40ba2fe3cbe 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -14,48 +14,17 @@ #define AFS_LOCK_GRANTED 0 #define AFS_LOCK_PENDING 1 +struct workqueue_struct *afs_lock_manager; + static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl); static void afs_fl_release_private(struct file_lock *fl); -static struct workqueue_struct *afs_lock_manager; -static DEFINE_MUTEX(afs_lock_manager_mutex); - static const struct file_lock_operations afs_lock_ops = { .fl_copy_lock = afs_fl_copy_lock, .fl_release_private = afs_fl_release_private, }; /* - * initialise the lock manager thread if it isn't already running - */ -static int afs_init_lock_manager(void) -{ - int ret; - - ret = 0; - if (!afs_lock_manager) { - mutex_lock(&afs_lock_manager_mutex); - if (!afs_lock_manager) { - afs_lock_manager = alloc_workqueue("kafs_lockd", - WQ_MEM_RECLAIM, 0); - if (!afs_lock_manager) - ret = -ENOMEM; - } - mutex_unlock(&afs_lock_manager_mutex); - } - return ret; -} - -/* - * destroy the lock manager thread if it's running - */ -void __exit afs_kill_lock_manager(void) -{ - if (afs_lock_manager) - destroy_workqueue(afs_lock_manager); -} - -/* * if the callback is broken on this vnode, then the lock may now be available */ void afs_lock_may_be_available(struct afs_vnode *vnode) @@ -99,6 +68,100 @@ static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl) } /* + * Get a lock on a file + */ +static int afs_set_lock(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x,%u", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), type); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_set_lock(&fc, type); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = 
afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Extend a lock on a file + */ +static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_current_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_extend_lock(&fc); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Release a lock on a file + */ +static int afs_release_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_current_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_release_lock(&fc); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* * do work for a lock, including: * - probing for a lock we're waiting on but didn't get immediately * - extending a lock that's close to timing out @@ -107,7 +170,7 @@ void afs_lock_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, lock_work.work); - struct file_lock *fl; + struct file_lock *fl, *next; afs_lock_type_t type; struct key *key; int ret; @@ -116,117 +179,136 @@ void afs_lock_work(struct work_struct *work) spin_lock(&vnode->lock); - if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { +again: + _debug("wstate %u for %p", vnode->lock_state, vnode); + switch (vnode->lock_state) { + case AFS_VNODE_LOCK_NEED_UNLOCK: _debug("unlock"); + vnode->lock_state = AFS_VNODE_LOCK_UNLOCKING; spin_unlock(&vnode->lock); /* attempt to release the server lock; if it fails, we just - * wait 5 minutes and it'll time out anyway */ - ret = afs_vnode_release_lock(vnode, vnode->unlock_key); + * wait 5 minutes and it'll expire anyway */ + ret = afs_release_lock(vnode, vnode->lock_key); if (ret < 0) printk(KERN_WARNING "AFS:" " Failed to release lock on {%x:%x} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); - key_put(vnode->unlock_key); - vnode->unlock_key = NULL; - clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); - } + key_put(vnode->lock_key); + vnode->lock_key = NULL; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + if (list_empty(&vnode->pending_locks)) { + spin_unlock(&vnode->lock); + return; + } - /* if we've got a lock, then it must be time to extend that lock as AFS - * locks time out after 5 minutes */ - if (!list_empty(&vnode->granted_locks)) { + /* The new front of the queue now owns the state variables. */ + next = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; + goto again; + + /* If we've already got a lock, then it must be time to extend that + * lock as AFS locks time out after 5 minutes. 
+ */ + case AFS_VNODE_LOCK_GRANTED: _debug("extend"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->granted_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(fl->fl_file->private_data); + ASSERT(!list_empty(&vnode->granted_locks)); + + key = key_get(vnode->lock_key); + vnode->lock_state = AFS_VNODE_LOCK_EXTENDING; spin_unlock(&vnode->lock); - ret = afs_vnode_extend_lock(vnode, key); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_extend_lock(vnode, key); /* RPC */ key_put(key); - switch (ret) { - case 0: + + if (ret < 0) + pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + + if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING) + goto again; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + + if (ret == 0) afs_schedule_lock_extension(vnode); - break; - default: - /* ummm... we failed to extend the lock - retry - * extension shortly */ - printk(KERN_WARNING "AFS:" - " Failed to extend lock on {%x:%x} error %d\n", - vnode->fid.vid, vnode->fid.vnode, ret); + else queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 10); - break; - } - _leave(" [extend]"); + spin_unlock(&vnode->lock); + _leave(" [ext]"); return; - } - /* if we don't have a granted lock, then we must've been called back by - * the server, and so if might be possible to get a lock we're - * currently waiting for */ - if (!list_empty(&vnode->pending_locks)) { + /* If we don't have a granted lock, then we must've been called + * back by the server, and so it might be possible to get a + * lock we're currently waiting for. + */ + case AFS_VNODE_LOCK_WAITING_FOR_CB: _debug("get"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(fl->fl_file->private_data); - type = (fl->fl_type == F_RDLCK) ? - AFS_LOCK_READ : AFS_LOCK_WRITE; + key = key_get(vnode->lock_key); + type = vnode->lock_type; + vnode->lock_state = AFS_VNODE_LOCK_SETTING; spin_unlock(&vnode->lock); - ret = afs_vnode_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_set_lock(vnode, key, type); /* RPC */ + key_put(key); + + spin_lock(&vnode->lock); switch (ret) { case -EWOULDBLOCK: _debug("blocked"); break; case 0: _debug("acquired"); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - ret = AFS_LOCK_GRANTED; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + /* Fall through */ default: - spin_lock(&vnode->lock); - /* the pending lock may have been withdrawn due to a - * signal */ - if (list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link) == fl) { - fl->fl_u.afs.state = ret; - if (ret == AFS_LOCK_GRANTED) - afs_grant_locks(vnode, fl); - else - list_del_init(&fl->fl_u.afs.link); - wake_up(&fl->fl_wait); - spin_unlock(&vnode->lock); - } else { + /* Pass the lock or the error onto the first locker in + * the list - if they're looking for this type of lock. + * If they're not, we assume that whoever asked for it + * took a signal. + */
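[Editorial note, not part of the commit: the rewritten afs_lock_work() above replaces the old AFS_VNODE_{READLOCKED,WRITELOCKED,LOCKING,UNLOCKING} bits with a single explicit lock_state variable, driven around the "again:" loop. A compressed model of the transitions, using state names that mirror the AFS_VNODE_LOCK_* constants (keys, scheduling and error reporting omitted):

	#include <stdbool.h>

	enum lock_state {
		LOCK_NONE,		/* no server lock, no waiters */
		LOCK_WAITING_FOR_CB,	/* waiting to retry after a callback break */
		LOCK_SETTING,		/* SetLock RPC in flight */
		LOCK_GRANTED,		/* server lock held */
		LOCK_EXTENDING,		/* ExtendLock RPC in flight */
		LOCK_NEED_UNLOCK,	/* lock dropped, server must be told */
		LOCK_UNLOCKING,		/* ReleaseLock RPC in flight */
	};

	/* One step of the manager: rpc_ok is the outcome of whatever RPC the
	 * current state issues; waiters says whether pending_locks is non-empty. */
	static enum lock_state lock_step(enum lock_state s, bool rpc_ok, bool waiters)
	{
		switch (s) {
		case LOCK_WAITING_FOR_CB:
			return LOCK_SETTING;
		case LOCK_SETTING:
			return rpc_ok ? LOCK_GRANTED : LOCK_WAITING_FOR_CB;
		case LOCK_GRANTED:
			return LOCK_EXTENDING;	/* AFS locks expire after 5 minutes */
		case LOCK_EXTENDING:
			return LOCK_GRANTED;	/* on failure, retried shortly */
		case LOCK_NEED_UNLOCK:
			return LOCK_UNLOCKING;
		case LOCK_UNLOCKING:
			/* the next waiter inherits the state variables */
			return waiters ? LOCK_WAITING_FOR_CB : LOCK_NONE;
		default:
			return s;
		}
	}

The handoff in the last case is the point of the comment just above: the file_lock at the front of pending_locks takes over ->lock_key, ->lock_type and ->lock_state.]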
+			 */
+			if (list_empty(&vnode->pending_locks)) {
 				_debug("withdrawn");
-				clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
-				clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
-				spin_unlock(&vnode->lock);
-				afs_vnode_release_lock(vnode, key);
-				if (!list_empty(&vnode->pending_locks))
-					afs_lock_may_be_available(vnode);
+				vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
+				goto again;
 			}
-			break;
+
+			fl = list_entry(vnode->pending_locks.next,
+					struct file_lock, fl_u.afs.link);
+			type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+			if (vnode->lock_type != type) {
+				_debug("changed");
+				vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
+				goto again;
+			}
+
+			fl->fl_u.afs.state = ret;
+			if (ret == 0)
+				afs_grant_locks(vnode, fl);
+			else
+				list_del_init(&fl->fl_u.afs.link);
+			wake_up(&fl->fl_wait);
+			spin_unlock(&vnode->lock);
+			_leave(" [granted]");
+			return;
 		}
-		key_put(key);
-		_leave(" [pend]");
+
+	default:
+		/* Looks like a lock request was withdrawn. */
+		spin_unlock(&vnode->lock);
+		_leave(" [no]");
 		return;
 	}
-
-	/* looks like the lock request was withdrawn on a signal */
-	spin_unlock(&vnode->lock);
-	_leave(" [no locks]");
 }
 
 /*
@@ -235,15 +317,105 @@ void afs_lock_work(struct work_struct *work)
  * AF_RXRPC
  * - the caller must hold the vnode lock
  */
-static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
+static void afs_defer_unlock(struct afs_vnode *vnode)
+{
+	_enter("");
+
+	if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED ||
+	    vnode->lock_state == AFS_VNODE_LOCK_EXTENDING) {
+		cancel_delayed_work(&vnode->lock_work);
+
+		vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
+		afs_lock_may_be_available(vnode);
+	}
+}
+
+/*
+ * Check that our view of the file metadata is up to date and check to see
+ * whether we think that we have a locking permit.
+ */
+static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key,
+			      afs_lock_type_t type, bool can_sleep)
+{
+	afs_access_t access;
+	int ret;
+
+	/* Make sure we've got a callback on this file and that our view of the
+	 * data version is up to date.
+	 */
+	ret = afs_validate(vnode, key);
+	if (ret < 0)
+		return ret;
+
+	/* Check the permission set to see if we're actually going to be
+	 * allowed to get a lock on this file.
+	 */
+	ret = afs_check_permit(vnode, key, &access);
+	if (ret < 0)
+		return ret;
+
+	/* At a rough estimation, you need LOCK, WRITE or INSERT perm to
+	 * read-lock a file and WRITE or INSERT perm to write-lock a file.
+	 *
+	 * We can't rely on the server to do this for us since if we want to
+	 * share a read lock that we already have, we won't go to the server.
+	 */
+	if (type == AFS_LOCK_READ) {
+		if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK)))
+			return -EACCES;
+		if (vnode->status.lock_count == -1 && !can_sleep)
+			return -EAGAIN; /* Write locked */
+	} else {
+		if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE)))
+			return -EACCES;
+		if (vnode->status.lock_count != 0 && !can_sleep)
+			return -EAGAIN; /* Locked */
+	}
+
+	return 0;
+}
+
+/*
+ * Remove the front runner from the pending queue.
+ * - The caller must hold vnode->lock.
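+ * - (Editorial summary, not part of the original patch: the three cases
+ *   below are (1) fl holds the only granted lock, so unlink it and defer
+ *   the server-side unlock; (2) fl neither holds the grant nor heads the
+ *   pending queue, so just unlink it; (3) fl owns ->lock_key, ->lock_type
+ *   and ->lock_state, so unlink it and hand those over to the next
+ *   pending locker.)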
+ */
+static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl)
 {
-	cancel_delayed_work(&vnode->lock_work);
-	if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) &&
-	    !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
-		BUG();
-	if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags))
-		BUG();
-	vnode->unlock_key = key_get(key);
+	struct file_lock *next;
+
+	_enter("");
+
+	/* ->lock_type, ->lock_key and ->lock_state only belong to this
+	 * file_lock if we're at the front of the pending queue or if we have
+	 * the lock granted or if the lock_state is NEED_UNLOCK or UNLOCKING.
+	 */
+	if (vnode->granted_locks.next == &fl->fl_u.afs.link &&
+	    vnode->granted_locks.prev == &fl->fl_u.afs.link) {
+		list_del_init(&fl->fl_u.afs.link);
+		afs_defer_unlock(vnode);
+		return;
+	}
+
+	if (!list_empty(&vnode->granted_locks) ||
+	    vnode->pending_locks.next != &fl->fl_u.afs.link) {
+		list_del_init(&fl->fl_u.afs.link);
+		return;
+	}
+
+	list_del_init(&fl->fl_u.afs.link);
+	key_put(vnode->lock_key);
+	vnode->lock_key = NULL;
+	vnode->lock_state = AFS_VNODE_LOCK_NONE;
+
+	if (list_empty(&vnode->pending_locks))
+		return;
+
+	/* The new front of the queue now owns the state variables. */
+	next = list_entry(vnode->pending_locks.next,
+			  struct file_lock, fl_u.afs.link);
+	vnode->lock_key = afs_file_key(next->fl_file);
+	vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+	vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
 	afs_lock_may_be_available(vnode);
 }
 
@@ -252,10 +424,10 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
  */
 static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
-	struct inode *inode = file_inode(file);
+	struct inode *inode = locks_inode(file);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_lock_type_t type;
-	struct key *key = file->private_data;
+	struct key *key = afs_file_key(file);
 	int ret;
 
 	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
 
@@ -264,175 +436,142 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
 		return -EINVAL;
 
-	ret = afs_init_lock_manager();
-	if (ret < 0)
-		return ret;
-
 	fl->fl_ops = &afs_lock_ops;
 	INIT_LIST_HEAD(&fl->fl_u.afs.link);
 	fl->fl_u.afs.state = AFS_LOCK_PENDING;
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	spin_lock(&inode->i_lock);
-
-	/* make sure we've got a callback on this file and that our view of the
-	 * data version is up to date */
-	ret = afs_vnode_fetch_status(vnode, NULL, key);
+	ret = afs_do_setlk_check(vnode, key, type, fl->fl_flags & FL_SLEEP);
 	if (ret < 0)
-		goto error;
-
-	if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) {
-		ret = -EAGAIN;
-		goto error;
-	}
+		return ret;
 
 	spin_lock(&vnode->lock);
 
-	/* if we've already got a readlock on the server then we can instantly
+	/* If we've already got a readlock on the server then we instantly
 	 * grant another readlock, irrespective of whether there are any
-	 * pending writelocks */
+	 * pending writelocks.
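+	 *
+	 * (Editorial illustration, not part of the original patch: from
+	 * userspace, a second shared lock like the one below is granted at
+	 * once while the client already holds the server read lock.  Note
+	 * that kAFS only accepts whole-file locks, hence l_start = 0 and
+	 * l_len = 0 below:
+	 *
+	 *	struct flock fl = {
+	 *		.l_type		= F_RDLCK,
+	 *		.l_whence	= SEEK_SET,
+	 *		.l_start	= 0,
+	 *		.l_len		= 0,	// 0 == to end of file
+	 *	};
+	 *	fcntl(fd, F_SETLK, &fl);
+	 * )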
+	 */
 	if (type == AFS_LOCK_READ &&
-	    vnode->flags & (1 << AFS_VNODE_READLOCKED)) {
+	    vnode->lock_state == AFS_VNODE_LOCK_GRANTED &&
+	    vnode->lock_type == AFS_LOCK_READ) {
 		_debug("instant readlock");
-		ASSERTCMP(vnode->flags &
-			  ((1 << AFS_VNODE_LOCKING) |
-			   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
 		ASSERT(!list_empty(&vnode->granted_locks));
-		goto sharing_existing_lock;
+		goto share_existing_lock;
 	}
 
-	/* if there's no-one else with a lock on this vnode, then we need to
-	 * ask the server for a lock */
-	if (list_empty(&vnode->pending_locks) &&
-	    list_empty(&vnode->granted_locks)) {
-		_debug("not locked");
-		ASSERTCMP(vnode->flags &
-			  ((1 << AFS_VNODE_LOCKING) |
-			   (1 << AFS_VNODE_READLOCKED) |
-			   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
-		list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
-		set_bit(AFS_VNODE_LOCKING, &vnode->flags);
-		spin_unlock(&vnode->lock);
+	list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
 
-		ret = afs_vnode_set_lock(vnode, key, type);
-		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
-		switch (ret) {
-		case 0:
-			_debug("acquired");
-			goto acquired_server_lock;
-		case -EWOULDBLOCK:
-			_debug("would block");
-			spin_lock(&vnode->lock);
-			ASSERT(list_empty(&vnode->granted_locks));
-			ASSERTCMP(vnode->pending_locks.next, ==,
-				  &fl->fl_u.afs.link);
-			goto wait;
-		default:
-			spin_lock(&vnode->lock);
-			list_del_init(&fl->fl_u.afs.link);
-			spin_unlock(&vnode->lock);
-			goto error;
-		}
-	}
+	if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
+		goto need_to_wait;
 
-	/* otherwise, we need to wait for a local lock to become available */
-	_debug("wait local");
-	list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
-wait:
-	if (!(fl->fl_flags & FL_SLEEP)) {
-		_debug("noblock");
-		ret = -EAGAIN;
-		goto abort_attempt;
-	}
+	/* We don't have a lock on this vnode and we aren't currently waiting
+	 * for one either, so ask the server for a lock.
+	 *
+	 * Note that we need to be careful if we get interrupted by a signal
+	 * after dispatching the request as we may still get the lock, even
+	 * though we don't wait for the reply (it's not too bad a problem - the
+	 * lock will expire in 10 mins anyway).
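+	 *
+	 * (Editor's note, not part of the original patch: the window is
+	 * between afs_set_lock() completing on the server and the waiter
+	 * being interrupted before it collects the result - the server may
+	 * then hold a grant that no local locker consumes, and expiry is
+	 * the backstop.)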
+	 */
+	_debug("not locked");
+	vnode->lock_key = key_get(key);
+	vnode->lock_type = type;
+	vnode->lock_state = AFS_VNODE_LOCK_SETTING;
 	spin_unlock(&vnode->lock);
 
-	/* now we need to sleep and wait for the lock manager thread to get the
-	 * lock from the server */
-	_debug("sleep");
-	ret = wait_event_interruptible(fl->fl_wait,
-				       fl->fl_u.afs.state <= AFS_LOCK_GRANTED);
-	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
-		ret = fl->fl_u.afs.state;
-		if (ret < 0)
-			goto error;
-		spin_lock(&vnode->lock);
-		goto given_lock;
-	}
-
-	/* we were interrupted, but someone may still be in the throes of
-	 * giving us the lock */
-	_debug("intr");
-	ASSERTCMP(ret, ==, -ERESTARTSYS);
+	ret = afs_set_lock(vnode, key, type); /* RPC */
 
 	spin_lock(&vnode->lock);
-	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
-		ret = fl->fl_u.afs.state;
-		if (ret < 0) {
-			spin_unlock(&vnode->lock);
-			goto error;
-		}
-		goto given_lock;
-	}
+	switch (ret) {
+	default:
+		goto abort_attempt;
 
-abort_attempt:
-	/* we aren't going to get the lock, either because we're unwilling to
-	 * wait, or because some signal happened */
-	_debug("abort");
-	if (list_empty(&vnode->granted_locks) &&
-	    vnode->pending_locks.next == &fl->fl_u.afs.link) {
-		if (vnode->pending_locks.prev != &fl->fl_u.afs.link) {
-			/* kick the next pending lock into having a go */
-			list_del_init(&fl->fl_u.afs.link);
-			afs_lock_may_be_available(vnode);
-		}
-	} else {
-		list_del_init(&fl->fl_u.afs.link);
+	case -EWOULDBLOCK:
+		/* The server doesn't have a lock-waiting queue, so the client
+		 * will have to retry.  The server will break the outstanding
+		 * callbacks on a file when a lock is released.
+		 */
+		_debug("would block");
+		ASSERT(list_empty(&vnode->granted_locks));
+		ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link);
+		vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
+		goto need_to_wait;
+
+	case 0:
+		_debug("acquired");
+		break;
 	}
-	spin_unlock(&vnode->lock);
-	goto error;
 
-acquired_server_lock:
 	/* we've acquired a server lock, but it needs to be renewed after 5
 	 * mins */
-	spin_lock(&vnode->lock);
+	vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
 	afs_schedule_lock_extension(vnode);
-	if (type == AFS_LOCK_READ)
-		set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
-	else
-		set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
-sharing_existing_lock:
+
+share_existing_lock:
 	/* the lock has been granted as far as we're concerned... */
 	fl->fl_u.afs.state = AFS_LOCK_GRANTED;
 	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+
 given_lock:
 	/* ... but we do still need to get the VFS's blessing */
-	ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING)));
-	ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) |
-				(1 << AFS_VNODE_WRITELOCKED))) != 0);
+	spin_unlock(&vnode->lock);
+
 	ret = posix_lock_file(file, fl, NULL);
 	if (ret < 0)
 		goto vfs_rejected_lock;
-	spin_unlock(&vnode->lock);
 
-	/* again, make sure we've got a callback on this file and, again, make
+	/* Again, make sure we've got a callback on this file and, again, make
 	 * sure that our view of the data version is up to date (we ignore
-	 * errors incurred here and deal with the consequences elsewhere) */
-	afs_vnode_fetch_status(vnode, NULL, key);
+	 * errors incurred here and deal with the consequences elsewhere).
+	 */
+	afs_validate(vnode, key);
+	_leave(" = 0");
+	return 0;
 
-error:
-	spin_unlock(&inode->i_lock);
+need_to_wait:
+	/* We're going to have to wait.  Either this client doesn't have a lock
+	 * on the server yet and we need to wait for a callback to occur, or
+	 * the client does have a lock on the server, but it belongs to some
+	 * other process(es) and is incompatible with the lock we want.
+	 */
+	ret = -EAGAIN;
+	if (fl->fl_flags & FL_SLEEP) {
+		spin_unlock(&vnode->lock);
+
+		_debug("sleep");
+		ret = wait_event_interruptible(fl->fl_wait,
+					       fl->fl_u.afs.state != AFS_LOCK_PENDING);
+
+		spin_lock(&vnode->lock);
+	}
+
+	if (fl->fl_u.afs.state == AFS_LOCK_GRANTED)
+		goto given_lock;
+	if (fl->fl_u.afs.state < 0)
+		ret = fl->fl_u.afs.state;
+
+abort_attempt:
+	/* we aren't going to get the lock, either because we're unwilling to
+	 * wait, or because some signal happened */
+	_debug("abort");
+	afs_dequeue_lock(vnode, fl);
+
+error_unlock:
+	spin_unlock(&vnode->lock);
 	_leave(" = %d", ret);
 	return ret;
 
 vfs_rejected_lock:
-	/* the VFS rejected the lock we just obtained, so we have to discard
-	 * what we just got */
+	/* The VFS rejected the lock we just obtained, so we have to discard
+	 * what we just got.  We defer this to the lock manager work item to
+	 * deal with.
+	 */
 	_debug("vfs refused %d", ret);
+	spin_lock(&vnode->lock);
 	list_del_init(&fl->fl_u.afs.link);
 	if (list_empty(&vnode->granted_locks))
-		afs_defer_unlock(vnode, key);
-	goto abort_attempt;
+		afs_defer_unlock(vnode);
+	goto error_unlock;
 }
 
 /*
@@ -440,34 +579,21 @@ vfs_rejected_lock:
  */
 static int afs_do_unlk(struct file *file, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
-	struct key *key = file->private_data;
+	struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
 	int ret;
 
 	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
 
+	/* Flush all pending writes before doing anything with locks. */
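+	/* (Editor's note, not part of the original patch: data written under
+	 * the lock should reach the server before the lock is released; the
+	 * server-side unlock itself happens later, via afs_fl_release_private()
+	 * -> afs_dequeue_lock(), once the VFS has dropped the lock.)
+	 */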
+	vfs_fsync(file, 0);
+
 	/* only whole-file unlocks are supported */
 	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
 		return -EINVAL;
 
-	fl->fl_ops = &afs_lock_ops;
-	INIT_LIST_HEAD(&fl->fl_u.afs.link);
-	fl->fl_u.afs.state = AFS_LOCK_PENDING;
-
-	spin_lock(&vnode->lock);
 	ret = posix_lock_file(file, fl, NULL);
-	if (ret < 0) {
-		spin_unlock(&vnode->lock);
-		_leave(" = %d [vfs]", ret);
-		return ret;
-	}
-
-	/* discard the server lock only if all granted locks are gone */
-	if (list_empty(&vnode->granted_locks))
-		afs_defer_unlock(vnode, key);
-	spin_unlock(&vnode->lock);
-	_leave(" = 0");
-	return 0;
+	_leave(" = %d [%u]", ret, vnode->lock_state);
+	return ret;
 }
 
 /*
@@ -475,37 +601,33 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
  */
 static int afs_do_getlk(struct file *file, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
-	struct key *key = file->private_data;
+	struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
+	struct key *key = afs_file_key(file);
 	int ret, lock_count;
 
 	_enter("");
 
 	fl->fl_type = F_UNLCK;
 
-	inode_lock(&vnode->vfs_inode);
-
 	/* check local lock records first */
-	ret = 0;
 	posix_test_lock(file, fl);
 	if (fl->fl_type == F_UNLCK) {
 		/* no local locks; consult the server */
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		ret = afs_fetch_status(vnode, key);
 		if (ret < 0)
 			goto error;
-		lock_count = vnode->status.lock_count;
-		if (lock_count) {
-			if (lock_count > 0)
-				fl->fl_type = F_RDLCK;
-			else
-				fl->fl_type = F_WRLCK;
-			fl->fl_start = 0;
-			fl->fl_end = OFFSET_MAX;
-		}
+
+		lock_count = READ_ONCE(vnode->status.lock_count);
+		if (lock_count > 0)
+			fl->fl_type = F_RDLCK;
+		else
+			fl->fl_type = F_WRLCK;
+		fl->fl_start = 0;
+		fl->fl_end = OFFSET_MAX;
 	}
 
+	ret = 0;
 error:
-	inode_unlock(&vnode->vfs_inode);
 	_leave(" = %d [%hd]", ret, fl->fl_type);
 	return ret;
 }
 
@@ -515,7 +637,7 @@ error:
  */
 int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+	struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
 
 	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -538,7 +660,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
  */
 int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+	struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
 
 	_enter("{%x:%u},%d,{t=%x,fl=%x}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -568,9 +690,13 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
  */
 static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
+	struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file));
+
 	_enter("");
 
+	spin_lock(&vnode->lock);
 	list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
+	spin_unlock(&vnode->lock);
 }
 
 /*
@@ -579,7 +705,12 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
  */
 static void afs_fl_release_private(struct file_lock *fl)
 {
+	struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file));
+
 	_enter("");
 
-	list_del_init(&fl->fl_u.afs.link);
+	spin_lock(&vnode->lock);
+	afs_dequeue_lock(vnode, fl);
+	_debug("state %u for %p", vnode->lock_state, vnode);
+	spin_unlock(&vnode->lock);
 }
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 19f76ae36982..b90ef39ae914 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -16,12 +16,19 @@
 #include "internal.h"
 #include "afs_fs.h"
 
+static const struct afs_fid afs_zero_fid;
+
 /*
  * We need somewhere to discard into in case the server helpfully returns more
  * than we asked for in FS.FetchData{,64}.
  */
 static u8 afs_discard_buffer[64];
 
+static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
+{
+	call->cbi = afs_get_cb_interest(cbi);
+}
+
 /*
  * decode an AFSFid block
  */
@@ -47,14 +54,18 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	const __be32 *bp = *_bp;
 	umode_t mode;
 	u64 data_version, size;
-	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+	bool changed = false;
 	kuid_t owner;
 	kgid_t group;
 
+	if (vnode)
+		write_seqlock(&vnode->cb_lock);
+
 #define EXTRACT(DST)				\
 	do {					\
 		u32 x = ntohl(*bp++);		\
-		changed |= DST - x;		\
+		if (DST != x)			\
+			changed |= true;	\
 		DST = x;			\
 	} while (0)
 
@@ -70,8 +81,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	EXTRACT(status->caller_access); /* call ticket dependent */
 	EXTRACT(status->anon_access);
 	EXTRACT(status->mode);
-	EXTRACT(status->parent.vnode);
-	EXTRACT(status->parent.unique);
+	bp++; /* parent.vnode */
+	bp++; /* parent.unique */
 	bp++; /* seg size */
 	status->mtime_client = ntohl(*bp++);
 	status->mtime_server = ntohl(*bp++);
@@ -95,7 +106,6 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	       status->mtime_client, status->mtime_server);
 
 	if (vnode) {
-		status->parent.vid = vnode->fid.vid;
 		if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
 			_debug("vnode changed");
 			i_size_write(&vnode->vfs_inode, size);
@@ -127,25 +137,47 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 			_debug("vnode modified %llx on {%x:%u}",
 			       (unsigned long long) data_version,
 			       vnode->fid.vid, vnode->fid.vnode);
-			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+			set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags);
 			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 		}
 	} else if (store_version) {
 		status->data_version = data_version;
 	}
+
+	if (vnode)
+		write_sequnlock(&vnode->cb_lock);
 }
 
 /*
  * decode an AFSCallBack block
 */
-static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+static void xdr_decode_AFSCallBack(struct afs_call *call,
+				   struct afs_vnode *vnode,
+				   const __be32 **_bp)
 {
+	struct afs_cb_interest *old, *cbi = call->cbi;
 	const __be32 *bp = *_bp;
+	u32 cb_expiry;
+
+	write_seqlock(&vnode->cb_lock);
+
+	if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) {
+		vnode->cb_version = ntohl(*bp++);
+		cb_expiry = ntohl(*bp++);
+		vnode->cb_type = ntohl(*bp++);
+		vnode->cb_expires_at = cb_expiry + ktime_get_real_seconds();
+		old = vnode->cb_interest;
+		if (old != call->cbi) {
+			vnode->cb_interest = cbi;
+			cbi = old;
+		}
+		set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+	} else {
+		bp += 3;
+	}
 
-	vnode->cb_version = ntohl(*bp++);
-	vnode->cb_expiry = ntohl(*bp++);
-	vnode->cb_type = ntohl(*bp++);
-	vnode->cb_expires = vnode->cb_expiry + ktime_get_real_seconds();
+	write_sequnlock(&vnode->cb_lock);
+	call->cbi = cbi;
 	*_bp = bp;
 }
 
@@ -243,22 +275,22 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
 */
 static int afs_deliver_fs_fetch_status(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
-	_enter("");
-
 	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	xdr_decode_AFSCallBack(&bp, vnode);
-	if (call->reply2)
-		xdr_decode_AFSVolSync(&bp, call->reply2);
+	xdr_decode_AFSCallBack(call, vnode, &bp);
+	if (call->reply[1])
+		xdr_decode_AFSVolSync(&bp, call->reply[1]);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -269,35 +301,33 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSFetchStatus = {
 	.name		= "FS.FetchStatus",
+	.op		= afs_FS_FetchStatus,
 	.deliver	= afs_deliver_fs_fetch_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * fetch the status information for a file
 */
-int afs_fs_fetch_file_status(struct afs_server *server,
-			     struct key *key,
-			     struct afs_vnode *vnode,
-			     struct afs_volsync *volsync,
-			     bool async)
+int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
-	if (!call)
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+	if (!call) {
+		fc->ac.error = -ENOMEM;
 		return -ENOMEM;
+	}
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = volsync;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = volsync;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -306,7 +336,10 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -314,8 +347,8 @@ int afs_fs_fetch_file_status(struct afs_server *server,
  */
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
-	struct afs_read *req = call->reply3;
+	struct afs_vnode *vnode = call->reply[0];
+	struct afs_read *req = call->reply[2];
 	const __be32 *bp;
 	unsigned int size;
 	void *buffer;
@@ -431,9 +464,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	xdr_decode_AFSCallBack(&bp, vnode);
-	if (call->reply2)
-		xdr_decode_AFSVolSync(&bp, call->reply2);
+	xdr_decode_AFSCallBack(call, vnode, &bp);
+	if (call->reply[1])
+		xdr_decode_AFSVolSync(&bp, call->reply[1]);
 
 	call->offset = 0;
 	call->unmarshall++;
@@ -457,7 +490,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 
 static void afs_fetch_data_destructor(struct afs_call *call)
 {
-	struct afs_read *req = call->reply3;
+	struct afs_read *req = call->reply[2];
 
 	afs_put_read(req);
 	afs_flat_call_destructor(call);
@@ -468,43 +501,38 @@ static void afs_fetch_data_destructor(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSFetchData = {
 	.name		= "FS.FetchData",
+	.op		= afs_FS_FetchData,
 	.deliver	= afs_deliver_fs_fetch_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_fetch_data_destructor,
 };
 
 static const struct afs_call_type afs_RXFSFetchData64 = {
 	.name		= "FS.FetchData64",
+	.op		= afs_FS_FetchData64,
 	.deliver	= afs_deliver_fs_fetch_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_fetch_data_destructor,
 };
 
 /*
  * fetch data from a very large file
 */
-static int afs_fs_fetch_data64(struct afs_server *server,
-			       struct key *key,
-			       struct afs_vnode *vnode,
-			       struct afs_read *req,
-			       bool async)
+static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = NULL; /* volsync */
-	call->reply3 = req;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->operation_ID = FSFETCHDATA64;
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = NULL; /* volsync */
+	call->reply[2] = req;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -518,39 +546,37 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	bp[7] = htonl(lower_32_bits(req->len));
 
 	atomic_inc(&req->usage);
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * fetch data from a file
 */
-int afs_fs_fetch_data(struct afs_server *server,
-		      struct key *key,
-		      struct afs_vnode *vnode,
-		      struct afs_read *req,
-		      bool async)
+int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	if (upper_32_bits(req->pos) ||
 	    upper_32_bits(req->len) ||
 	    upper_32_bits(req->pos + req->len))
-		return afs_fs_fetch_data64(server, key, vnode, req, async);
+		return afs_fs_fetch_data64(fc, req);
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = NULL; /* volsync */
-	call->reply3 = req;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->operation_ID = FSFETCHDATA;
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = NULL; /* volsync */
+	call->reply[2] = req;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -562,90 +588,10 @@ int afs_fs_fetch_data(struct afs_server *server,
 	bp[5] = htonl(lower_32_bits(req->len));
 
 	atomic_inc(&req->usage);
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
-}
-
-/*
- * deliver reply data to an FS.GiveUpCallBacks
- */
-static int afs_deliver_fs_give_up_callbacks(struct afs_call *call)
-{
-	_enter("");
-
-	/* shouldn't be any reply data */
-	return afs_extract_data(call, NULL, 0, false);
-}
-
-/*
- * FS.GiveUpCallBacks operation type
- */
-static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
-	.name		= "FS.GiveUpCallBacks",
-	.deliver	= afs_deliver_fs_give_up_callbacks,
-	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
-};
-
-/*
- * give up a set of callbacks
- * - the callbacks are held in the server->cb_break ring
- */
-int afs_fs_give_up_callbacks(struct afs_server *server,
-			     bool async)
-{
-	struct afs_call *call;
-	size_t ncallbacks;
-	__be32 *bp, *tp;
-	int loop;
-
-	ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
-			      ARRAY_SIZE(server->cb_break));
-
-	_enter("{%zu},", ncallbacks);
-
-	if (ncallbacks == 0)
-		return 0;
-	if (ncallbacks > AFSCBMAX)
-		ncallbacks = AFSCBMAX;
-
-	_debug("break %zu callbacks", ncallbacks);
-
-	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
-				   12 + ncallbacks * 6 * 4, 0);
-	if (!call)
-		return -ENOMEM;
-
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-
-	/* marshall the parameters */
-	bp = call->request;
-	tp = bp + 2 + ncallbacks * 3;
-	*bp++ = htonl(FSGIVEUPCALLBACKS);
-	*bp++ = htonl(ncallbacks);
-	*tp++ = htonl(ncallbacks);
-
-	atomic_sub(ncallbacks, &server->cb_break_n);
-	for (loop = ncallbacks; loop > 0; loop--) {
-		struct afs_callback *cb =
-			&server->cb_break[server->cb_break_tail];
-
-		*bp++ = htonl(cb->fid.vid);
-		*bp++ = htonl(cb->fid.vnode);
-		*bp++ = htonl(cb->fid.unique);
-		*tp++ = htonl(cb->version);
-		*tp++ = htonl(cb->expiry);
-		*tp++ = htonl(cb->type);
-		smp_mb();
-		server->cb_break_tail =
-			(server->cb_break_tail + 1) &
-			(ARRAY_SIZE(server->cb_break) - 1);
-	}
-
-	ASSERT(ncallbacks > 0);
-	wake_up_nr(&server->cb_break_waitq, ncallbacks);
-
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -653,7 +599,7 @@ int afs_fs_give_up_callbacks(struct afs_server *server,
  */
 static int afs_deliver_fs_create_vnode(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -665,11 +611,11 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFid(&bp, call->reply2);
-	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFid(&bp, call->reply[1]);
+	xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL);
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	xdr_decode_AFSCallBack_raw(&bp, call->reply4);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	xdr_decode_AFSCallBack_raw(&bp, call->reply[3]);
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -678,27 +624,33 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
 /*
  * FS.CreateFile and FS.MakeDir operation type
 */
-static const struct afs_call_type afs_RXFSCreateXXXX = {
-	.name		= "FS.CreateXXXX",
+static const struct afs_call_type afs_RXFSCreateFile = {
+	.name		= "FS.CreateFile",
+	.op		= afs_FS_CreateFile,
+	.deliver	= afs_deliver_fs_create_vnode,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSMakeDir = {
+	.name		= "FS.MakeDir",
+	.op		= afs_FS_MakeDir,
 	.deliver	= afs_deliver_fs_create_vnode,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * create a file or make a directory
 */
-int afs_fs_create(struct afs_server *server,
-		  struct key *key,
-		  struct afs_vnode *vnode,
+int afs_fs_create(struct afs_fs_cursor *fc,
 		  const char *name,
 		  umode_t mode,
 		  struct afs_fid *newfid,
 		  struct afs_file_status *newstatus,
-		  struct afs_callback *newcb,
-		  bool async)
+		  struct afs_callback *newcb)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -708,18 +660,17 @@ int afs_fs_create(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz + (6 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
-				   (3 + 21 + 21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(
+		net, S_ISDIR(mode) ? &afs_RXFSMakeDir : &afs_RXFSCreateFile,
+		reqsz, (3 + 21 + 21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = newfid;
-	call->reply3 = newstatus;
-	call->reply4 = newcb;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = newfid;
+	call->reply[2] = newstatus;
+	call->reply[3] = newcb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -741,7 +692,9 @@ int afs_fs_create(struct afs_server *server,
 	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -749,7 +702,7 @@ int afs_fs_create(struct afs_server *server,
  */
 static int afs_deliver_fs_remove(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -762,7 +715,7 @@ static int afs_deliver_fs_remove(struct afs_call *call)
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -771,24 +724,28 @@ static int afs_deliver_fs_remove(struct afs_call *call)
 /*
  * FS.RemoveDir/FS.RemoveFile operation type
 */
-static const struct afs_call_type afs_RXFSRemoveXXXX = {
-	.name		= "FS.RemoveXXXX",
+static const struct afs_call_type afs_RXFSRemoveFile = {
+	.name		= "FS.RemoveFile",
+	.op		= afs_FS_RemoveFile,
+	.deliver	= afs_deliver_fs_remove,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSRemoveDir = {
+	.name		= "FS.RemoveDir",
+	.op		= afs_FS_RemoveDir,
 	.deliver	= afs_deliver_fs_remove,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * remove a file or directory
 */
-int afs_fs_remove(struct afs_server *server,
-		  struct key *key,
-		  struct afs_vnode *vnode,
-		  const char *name,
-		  bool isdir,
-		  bool async)
+int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -798,14 +755,14 @@ int afs_fs_remove(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz;
 
-	call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+	call = afs_alloc_flat_call(
+		net, isdir ? &afs_RXFSRemoveDir : &afs_RXFSRemoveFile,
+		reqsz, (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -821,7 +778,9 @@ int afs_fs_remove(struct afs_server *server,
 		bp = (void *) bp + padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -829,7 +788,7 @@ int afs_fs_remove(struct afs_server *server,
  */
 static int afs_deliver_fs_link(struct afs_call *call)
 {
-	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+	struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1];
 	const __be32 *bp;
 	int ret;
 
@@ -843,7 +802,7 @@ static int afs_deliver_fs_link(struct afs_call *call)
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
 	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -854,22 +813,20 @@ static int afs_deliver_fs_link(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSLink = {
 	.name		= "FS.Link",
+	.op		= afs_FS_Link,
 	.deliver	= afs_deliver_fs_link,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * make a hard link
 */
-int afs_fs_link(struct afs_server *server,
-		struct key *key,
-		struct afs_vnode *dvnode,
-		struct afs_vnode *vnode,
-		const char *name,
-		bool async)
+int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+		const char *name)
 {
+	struct afs_vnode *dvnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -879,15 +836,13 @@ int afs_fs_link(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz + (3 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = dvnode;
-	call->reply2 = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = dvnode;
+	call->reply[1] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -906,7 +861,9 @@ int afs_fs_link(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -914,7 +871,7 @@ int afs_fs_link(struct afs_server *server,
  */
 static int afs_deliver_fs_symlink(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -926,10 +883,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFid(&bp, call->reply2);
-	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFid(&bp, call->reply[1]);
+	xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL);
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -940,24 +897,23 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSSymlink = {
 	.name		= "FS.Symlink",
+	.op		= afs_FS_Symlink,
 	.deliver	= afs_deliver_fs_symlink,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * create a symbolic link
 */
-int afs_fs_symlink(struct afs_server *server,
-		   struct key *key,
-		   struct afs_vnode *vnode,
+int afs_fs_symlink(struct afs_fs_cursor *fc,
 		   const char *name,
 		   const char *contents,
 		   struct afs_fid *newfid,
-		   struct afs_file_status *newstatus,
-		   bool async)
+		   struct afs_file_status *newstatus)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
 	__be32 *bp;
 
@@ -971,17 +927,15 @@ int afs_fs_symlink(struct afs_server *server,
 
 	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+	call = afs_alloc_flat_call(net, &afs_RXFSSymlink, reqsz,
 				   (3 + 21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = newfid;
-	call->reply3 = newstatus;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = newfid;
+	call->reply[2] = newstatus;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1010,7 +964,9 @@ int afs_fs_symlink(struct afs_server *server,
 	*bp++ = htonl(S_IRWXUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1018,7 +974,7 @@ int afs_fs_symlink(struct afs_server *server,
  */
 static int afs_deliver_fs_rename(struct afs_call *call)
 {
-	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	struct afs_vnode *orig_dvnode = call->reply[0], *new_dvnode = call->reply[1];
 	const __be32 *bp;
 	int ret;
 
@@ -1034,7 +990,7 @@ static int afs_deliver_fs_rename(struct afs_call *call)
 	if (new_dvnode != orig_dvnode)
 		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
 					  NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1045,23 +1001,22 @@ static int afs_deliver_fs_rename(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSRename = {
 	.name		= "FS.Rename",
+	.op		= afs_FS_Rename,
 	.deliver	= afs_deliver_fs_rename,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * create a symbolic link
 */
-int afs_fs_rename(struct afs_server *server,
-		  struct key *key,
-		  struct afs_vnode *orig_dvnode,
+int afs_fs_rename(struct afs_fs_cursor *fc,
 		  const char *orig_name,
 		  struct afs_vnode *new_dvnode,
-		  const char *new_name,
-		  bool async)
+		  const char *new_name)
 {
+	struct afs_vnode *orig_dvnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(orig_dvnode);
 	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
 	__be32 *bp;
 
@@ -1078,15 +1033,13 @@ int afs_fs_rename(struct afs_server *server,
 		(3 * 4) +
 		4 + n_namesz + n_padsz;
 
-	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = orig_dvnode;
-	call->reply2 = new_dvnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = orig_dvnode;
+	call->reply[1] = new_dvnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1113,7 +1066,9 @@ int afs_fs_rename(struct afs_server *server,
 		bp = (void *) bp + n_padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &orig_dvnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1121,7 +1076,7 @@ int afs_fs_rename(struct afs_server *server,
  */
 static int afs_deliver_fs_store_data(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -1135,7 +1090,7 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode,
 				  &call->store_version);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	afs_pages_written_back(vnode, call);
 
@@ -1148,47 +1103,44 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSStoreData = {
 	.name		= "FS.StoreData",
+	.op		= afs_FS_StoreData,
 	.deliver	= afs_deliver_fs_store_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSStoreData64 = {
 	.name		= "FS.StoreData64",
+	.op		= afs_FS_StoreData64,
 	.deliver	= afs_deliver_fs_store_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * store a set of pages to a very large file
 */
-static int afs_fs_store_data64(struct afs_server *server,
-			       struct afs_writeback *wb,
+static int afs_fs_store_data64(struct afs_fs_cursor *fc,
+			       struct address_space *mapping,
 			       pgoff_t first, pgoff_t last,
 			       unsigned offset, unsigned to,
-			       loff_t size, loff_t pos, loff_t i_size,
-			       bool async)
+			       loff_t size, loff_t pos, loff_t i_size)
 {
-	struct afs_vnode *vnode = wb->vnode;
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData64,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData64,
 				   (4 + 6 + 3 * 2) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->wb = wb;
-	call->key = wb->key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->mapping = vnode->vfs_inode.i_mapping;
+	call->key = fc->key;
+	call->mapping = mapping;
+	call->reply[0] = vnode;
 	call->first = first;
 	call->last = last;
 	call->first_offset = offset;
@@ -1217,24 +1169,25 @@ static int afs_fs_store_data64(struct afs_server *server,
 	*bp++ = htonl(i_size >> 32);
 	*bp++ = htonl((u32) i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * store a set of pages
 */
-int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
+int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 		      pgoff_t first, pgoff_t last,
-		      unsigned offset, unsigned to,
-		      bool async)
+		      unsigned offset, unsigned to)
 {
-	struct afs_vnode *vnode = wb->vnode;
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	loff_t size, pos, i_size;
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	size = (loff_t)to - (loff_t)offset;
 	if (first != last)
@@ -1251,21 +1204,18 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 	       (unsigned long long) i_size);
 
 	if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
-		return afs_fs_store_data64(server, wb, first, last, offset, to,
-					   size, pos, i_size, async);
+		return afs_fs_store_data64(fc, mapping, first, last, offset, to,
+					   size, pos, i_size);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData,
 				   (4 + 6 + 3) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->wb = wb;
-	call->key = wb->key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->mapping = vnode->vfs_inode.i_mapping;
+	call->key = fc->key;
+	call->mapping = mapping;
+	call->reply[0] = vnode;
 	call->first = first;
 	call->last = last;
 	call->first_offset = offset;
@@ -1291,7 +1241,9 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 	*bp++ = htonl(size);
 	*bp++ = htonl(i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1300,7 +1252,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 static int afs_deliver_fs_store_status(struct afs_call *call)
 {
 	afs_dataversion_t *store_version;
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -1317,7 +1269,7 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1328,22 +1280,22 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSStoreStatus = {
 	.name		= "FS.StoreStatus",
+	.op		= afs_FS_StoreStatus,
 	.deliver	= afs_deliver_fs_store_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSStoreData_as_Status = {
 	.name		= "FS.StoreData",
+	.op		= afs_FS_StoreData,
 	.deliver	= afs_deliver_fs_store_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
 	.name		= "FS.StoreData64",
+	.op		= afs_FS_StoreData64,
 	.deliver	= afs_deliver_fs_store_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1351,30 +1303,27 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
  * set the attributes on a very large file, using FS.StoreData rather than
  * FS.StoreStatus so as to alter the file size also
 */
-static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
-				 struct afs_vnode *vnode, struct iattr *attr,
-				 bool async)
+static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData64_as_Status,
 				   (4 + 6 + 3 * 2) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 	call->store_version = vnode->status.data_version + 1;
-	call->operation_ID = FSSTOREDATA;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1392,40 +1341,38 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 	*bp++ = htonl(attr->ia_size >> 32); /* new file length */
 	*bp++ = htonl((u32) attr->ia_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
 * so as to alter the file size also
 */
-static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
-			       struct afs_vnode *vnode, struct iattr *attr,
-			       bool async)
+static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
 	if (attr->ia_size >> 32)
-		return afs_fs_setattr_size64(server, key, vnode, attr,
-					     async);
+		return afs_fs_setattr_size64(fc, attr);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData_as_Status,
 				   (4 + 6 + 3) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 	call->store_version = vnode->status.data_version + 1;
-	call->operation_ID = FSSTOREDATA;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1440,38 +1387,36 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 	*bp++ = 0; /* size of write */
 	*bp++ = htonl(attr->ia_size); /* new file length */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * set the attributes on a file, using FS.StoreData if there's a change in file
 * size, and FS.StoreStatus otherwise
 */
-int afs_fs_setattr(struct afs_server *server, struct key *key,
-		   struct afs_vnode *vnode, struct iattr *attr,
-		   bool async)
+int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	if (attr->ia_valid & ATTR_SIZE)
-		return afs_fs_setattr_size(server, key, vnode, attr,
-					   async);
+		return afs_fs_setattr_size(fc, attr);
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreStatus,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus,
 				   (4 + 6) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->operation_ID = FSSTORESTATUS;
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1482,7 +1427,9 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 
 	xdr_encode_AFS_StoreStatus(&bp, attr);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1510,7 +1457,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 			return ret;
 
 		bp = call->buffer;
-		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
+		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]);
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -1531,13 +1478,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 3:
 		_debug("extract volname");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply3,
+			ret = afs_extract_data(call, call->reply[2],
 					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
 
-		p = call->reply3;
+		p = call->reply[2];
 		p[call->count] = 0;
 		_debug("volname '%s'", p);
 
@@ -1578,13 +1525,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 6:
 		_debug("extract offline");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply3,
+			ret = afs_extract_data(call, call->reply[2],
 					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
 
-		p = call->reply3;
+		p = call->reply[2];
 		p[call->count] = 0;
 		_debug("offline '%s'", p);
 
@@ -1625,13 +1572,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 9:
 		_debug("extract motd");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply3,
+			ret = afs_extract_data(call, call->reply[2],
 					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
 
-		p = call->reply3;
+		p = call->reply[2];
 		p[call->count] = 0;
 		_debug("motd '%s'", p);
 
@@ -1662,8 +1609,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
  */
 static void afs_get_volume_status_call_destructor(struct afs_call *call)
 {
-	kfree(call->reply3);
-	call->reply3 = NULL;
+	kfree(call->reply[2]);
+	call->reply[2] = NULL;
 	afs_flat_call_destructor(call);
 }
 
@@ -1672,21 +1619,20 @@ static void afs_get_volume_status_call_destructor(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSGetVolumeStatus = {
 	.name		= "FS.GetVolumeStatus",
+	.op		= afs_FS_GetVolumeStatus,
 	.deliver	= afs_deliver_fs_get_volume_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_get_volume_status_call_destructor,
 };
 
 /*
  * fetch the status of a volume
 */
-int afs_fs_get_volume_status(struct afs_server *server,
-			     struct key *key,
-			     struct afs_vnode *vnode,
-			     struct afs_volume_status *vs,
-			     bool async)
+int afs_fs_get_volume_status(struct afs_fs_cursor *fc,
+			     struct afs_volume_status *vs)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 	void *tmpbuf;
 
@@ -1696,25 +1642,25 @@ int afs_fs_get_volume_status(struct afs_server *server,
 	if (!tmpbuf)
 		return -ENOMEM;
 
-	call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
 	if (!call) {
 		kfree(tmpbuf);
 		return -ENOMEM;
 	}
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = vs;
-	call->reply3 = tmpbuf;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = vs;
+	call->reply[2] = tmpbuf;
 
 	/* marshall the parameters */
 	bp = call->request;
 	bp[0] = htonl(FSGETVOLUMESTATUS);
 	bp[1] = htonl(vnode->fid.vid);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1733,7 +1679,7 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call)
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1744,8 +1690,8 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSSetLock = {
 	.name		= "FS.SetLock",
+	.op		= afs_FS_SetLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1754,8 +1700,8 @@ static const struct afs_call_type afs_RXFSSetLock = {
  */
 static const struct afs_call_type afs_RXFSExtendLock = {
 	.name		= "FS.ExtendLock",
+	.op		= afs_FS_ExtendLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1764,33 +1710,29 @@ static const struct afs_call_type afs_RXFSExtendLock = {
  */
 static const struct afs_call_type afs_RXFSReleaseLock = {
 	.name		= "FS.ReleaseLock",
+	.op		= afs_FS_ReleaseLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
- * get a lock on a file
+ * Set a lock on a file
 */
-int afs_fs_set_lock(struct afs_server *server,
-		    struct key *key,
-		    struct afs_vnode *vnode,
-		    afs_lock_type_t type,
-		    bool async)
+int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1800,30 +1742,29 @@ int afs_fs_set_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.unique);
 	*bp++ = htonl(type);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * extend a lock on a file
 */
-int afs_fs_extend_lock(struct afs_server *server,
-		       struct key *key,
-		       struct afs_vnode *vnode,
-		       bool async)
+int afs_fs_extend_lock(struct afs_fs_cursor *fc)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1832,30 +1773,29 @@ int afs_fs_extend_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* * release a lock on a file */ -int afs_fs_release_lock(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - bool async) +int afs_fs_release_lock(struct afs_fs_cursor *fc) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4); + call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; /* marshall the parameters */ bp = call->request; @@ -1864,5 +1804,145 @@ int afs_fs_release_lock(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.GiveUpAllCallBacks operation. + */ +static int afs_deliver_fs_give_up_all_callbacks(struct afs_call *call) +{ + return afs_transfer_reply(call); +} + +/* + * FS.GiveUpAllCallBacks operation type + */ +static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = { + .name = "FS.GiveUpAllCallBacks", + .op = afs_FS_GiveUpAllCallBacks, + .deliver = afs_deliver_fs_give_up_all_callbacks, + .destructor = afs_flat_call_destructor, +}; + +/* + * Flush all the callbacks we have on a server. + */ +int afs_fs_give_up_all_callbacks(struct afs_net *net, + struct afs_server *server, + struct afs_addr_cursor *ac, + struct key *key) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXFSGiveUpAllCallBacks, 1 * 4, 0); + if (!call) + return -ENOMEM; + + call->key = key; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSGIVEUPALLCALLBACKS); + + /* Can't take a ref on server */ + return afs_make_call(ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.GetCapabilities operation. + */ +static int afs_deliver_fs_get_capabilities(struct afs_call *call) +{ + u32 count; + int ret; + + _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the capabilities word count */ + case 1: + ret = afs_extract_data(call, &call->tmp, + 1 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + count = ntohl(call->tmp); + + call->count = count; + call->count2 = count; + call->offset = 0; + call->unmarshall++; + + /* Extract capabilities words */ + case 2: + count = min(call->count, 16U); + ret = afs_extract_data(call, call->buffer, + count * sizeof(__be32), + call->count > 16); + if (ret < 0) + return ret; + + /* TODO: Examine capabilities */ + + call->count -= count; + if (call->count > 0) + goto again; + call->offset = 0; + call->unmarshall++; + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.GetCapabilities operation type + */ +static const struct afs_call_type afs_RXFSGetCapabilities = { + .name = "FS.GetCapabilities", + .op = afs_FS_GetCapabilities, + .deliver = afs_deliver_fs_get_capabilities, + .destructor = afs_flat_call_destructor, +}; + +/* + * Probe a fileserver for the capabilities that it supports. This can + * return up to 196 words. 
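+ * (Only a 16-word flat reply buffer is allocated below, so the deliver
+ * routine above drains the capability list in chunks of up to 16 words,
+ * currently discarding their contents - see the "TODO: Examine
+ * capabilities" note.)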
+ */ +int afs_fs_get_capabilities(struct afs_net *net, + struct afs_server *server, + struct afs_addr_cursor *ac, + struct key *key) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXFSGetCapabilities, 1 * 4, 16 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSGETCAPABILITIES); + + /* Can't take a ref on server */ + trace_afs_make_fs_call(call, NULL); + return afs_make_call(ac, call, GFP_NOFS, false); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 342316a9e3e0..3415eb7484f6 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -23,11 +23,6 @@ #include <linux/namei.h> #include "internal.h" -struct afs_iget_data { - struct afs_fid fid; - struct afs_volume *volume; /* volume on which resides */ -}; - static const struct inode_operations afs_symlink_inode_operations = { .get_link = page_get_link, .listxattr = afs_listxattr, @@ -39,6 +34,7 @@ static const struct inode_operations afs_symlink_inode_operations = { static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) { struct inode *inode = AFS_VNODE_TO_I(vnode); + bool changed; _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", vnode->status.type, @@ -47,6 +43,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) vnode->status.data_version, vnode->status.mode); + read_seqlock_excl(&vnode->cb_lock); + switch (vnode->status.type) { case AFS_FTYPE_FILE: inode->i_mode = S_IFREG | vnode->status.mode; @@ -63,9 +61,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) if ((vnode->status.mode & 0777) == 0644) { inode->i_flags |= S_AUTOMOUNT; - spin_lock(&vnode->lock); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - spin_unlock(&vnode->lock); inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; @@ -78,13 +74,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) break; default: printk("kAFS: AFS vnode with undefined type\n"); + read_sequnlock_excl(&vnode->cb_lock); return -EBADMSG; } -#ifdef CONFIG_AFS_FSCACHE - if (vnode->status.size != inode->i_size) - fscache_attr_changed(vnode->cache); -#endif + changed = (vnode->status.size != inode->i_size); set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; @@ -97,13 +91,49 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_generation = vnode->fid.unique; inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; + + read_sequnlock_excl(&vnode->cb_lock); + +#ifdef CONFIG_AFS_FSCACHE + if (changed) + fscache_attr_changed(vnode->cache); +#endif return 0; } /* + * Fetch file status from the volume. 
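+ * This shows the standard shape of an operation under the rotation code:
+ * begin a vnode operation, let afs_select_fileserver() walk the volume's
+ * server list, issue the RPC against each candidate, then check for
+ * remote deletion and commit the returned status before ending the
+ * operation.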
+ */ +int afs_fetch_status(struct afs_vnode *vnode, struct key *key) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s,{%x:%u.%u,S=%lx}", + vnode->volume->name, + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + vnode->flags); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_fetch_file_status(&fc, NULL); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* * iget5() comparator */ -static int afs_iget5_test(struct inode *inode, void *opaque) +int afs_iget5_test(struct inode *inode, void *opaque) { struct afs_iget_data *data = opaque; @@ -204,7 +234,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, */ struct inode *afs_iget(struct super_block *sb, struct key *key, struct afs_fid *fid, struct afs_file_status *status, - struct afs_callback *cb) + struct afs_callback *cb, struct afs_cb_interest *cbi) { struct afs_iget_data data = { .fid = *fid }; struct afs_super_info *as; @@ -237,8 +267,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, if (!status) { /* it's a remotely extant inode */ - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_vnode_fetch_status(vnode, NULL, key); + ret = afs_fetch_status(vnode, key); if (ret < 0) goto bad_inode; } else { @@ -249,16 +278,17 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* it's a symlink we just created (the fileserver * didn't give us a callback) */ vnode->cb_version = 0; - vnode->cb_expiry = 0; vnode->cb_type = 0; - vnode->cb_expires = ktime_get_real_seconds(); + vnode->cb_expires_at = 0; } else { vnode->cb_version = cb->version; - vnode->cb_expiry = cb->expiry; vnode->cb_type = cb->type; - vnode->cb_expires = vnode->cb_expiry + - ktime_get_real_seconds(); + vnode->cb_expires_at = cb->expiry; + vnode->cb_interest = afs_get_cb_interest(cbi); + set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } + + vnode->cb_expires_at += ktime_get_real_seconds(); } /* set up caching before mapping the status, as map-status reads the @@ -320,25 +350,34 @@ void afs_zap_data(struct afs_vnode *vnode) */ int afs_validate(struct afs_vnode *vnode, struct key *key) { + time64_t now = ktime_get_real_seconds(); + bool valid = false; int ret; _enter("{v={%x:%u} fl=%lx},%x", vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); - if (vnode->cb_promised && - !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && - !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { - if (vnode->cb_expires < ktime_get_real_seconds() + 10) { - _debug("callback expired"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - } else { - goto valid; + /* Quickly check the callback state. Ideally, we'd use read_seqbegin + * here, but we have no way to pass the net namespace to the RCU + * cleanup for the server record. 
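+ * Instead the seqlock is taken exclusively, which keeps ->cb_interest and
+ * the expiry fields stable while they are sampled.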
+ */ + read_seqlock_excl(&vnode->cb_lock); + + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) { + vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; + } else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) && + !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + vnode->cb_expires_at - 10 > now) { + valid = true; } + } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + valid = true; } - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + read_sequnlock_excl(&vnode->cb_lock); + if (valid) goto valid; mutex_lock(&vnode->validate_lock); @@ -347,12 +386,16 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * a new promise - note that if the (parent) directory's metadata was * changed then the security may be different and we may no longer have * access */ - if (!vnode->cb_promised || - test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { _debug("not promised"); - ret = afs_vnode_fetch_status(vnode, NULL, key); - if (ret < 0) + ret = afs_fetch_status(vnode, key); + if (ret < 0) { + if (ret == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } goto error_unlock; + } _debug("new promise [fl=%lx]", vnode->flags); } @@ -367,7 +410,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) afs_zap_data(vnode); - clear_bit(AFS_VNODE_MODIFIED, &vnode->flags); + clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); mutex_unlock(&vnode->validate_lock); valid: _leave(" = 0"); @@ -386,10 +429,17 @@ int afs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); + struct afs_vnode *vnode = AFS_FS_I(inode); + int seq = 0; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); - generic_fillattr(inode, stat); + do { + read_seqbegin_or_lock(&vnode->cb_lock, &seq); + generic_fillattr(inode, stat); + } while (need_seqretry(&vnode->cb_lock, seq)); + + done_seqretry(&vnode->cb_lock, seq); return 0; } @@ -411,18 +461,14 @@ int afs_drop_inode(struct inode *inode) */ void afs_evict_inode(struct inode *inode) { - struct afs_permits *permits; struct afs_vnode *vnode; vnode = AFS_FS_I(inode); - _enter("{%x:%u.%d} v=%u x=%u t=%u }", + _enter("{%x:%u.%d}", vnode->fid.vid, vnode->fid.vnode, - vnode->fid.unique, - vnode->cb_version, - vnode->cb_expiry, - vnode->cb_type); + vnode->fid.unique); _debug("CLEAR INODE %p", inode); @@ -431,31 +477,24 @@ void afs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - afs_give_up_callback(vnode); - - if (vnode->server) { - spin_lock(&vnode->server->fs_lock); - rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes); - spin_unlock(&vnode->server->fs_lock); - afs_put_server(vnode->server); - vnode->server = NULL; + if (vnode->cb_interest) { + afs_put_cb_interest(afs_i2net(inode), vnode->cb_interest); + vnode->cb_interest = NULL; } - ASSERT(list_empty(&vnode->writebacks)); - ASSERT(!vnode->cb_promised); + while (!list_empty(&vnode->wb_keys)) { + struct afs_wb_key *wbk = list_entry(vnode->wb_keys.next, + struct afs_wb_key, vnode_link); + list_del(&wbk->vnode_link); + afs_put_wb_key(wbk); + } #ifdef CONFIG_AFS_FSCACHE fscache_relinquish_cookie(vnode->cache, 0); vnode->cache = NULL; #endif - mutex_lock(&vnode->permits_lock); - permits = vnode->permits; - RCU_INIT_POINTER(vnode->permits, NULL); - 
mutex_unlock(&vnode->permits_lock); - if (permits) - call_rcu(&permits->rcu, afs_zap_permits); - + afs_put_permits(vnode->permit_cache); _leave(""); } @@ -464,6 +503,7 @@ void afs_evict_inode(struct inode *inode) */ int afs_setattr(struct dentry *dentry, struct iattr *attr) { + struct afs_fs_cursor fc; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; @@ -479,13 +519,11 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) } /* flush any dirty data outstanding on a regular file */ - if (S_ISREG(vnode->vfs_inode.i_mode)) { + if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); - afs_writeback_all(vnode); - } if (attr->ia_valid & ATTR_FILE) { - key = attr->ia_file->private_data; + key = afs_file_key(attr->ia_file); } else { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { @@ -494,7 +532,18 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) } } - ret = afs_vnode_setattr(vnode, key, attr); + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_setattr(&fc, attr); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + if (!(attr->ia_valid & ATTR_FILE)) key_put(key); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 82e16556afea..804d1f905622 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -21,6 +21,7 @@ #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> +#include <net/net_namespace.h> #include <net/af_rxrpc.h> #include "afs.h" @@ -31,16 +32,6 @@ struct pagevec; struct afs_call; -typedef enum { - AFS_VL_NEW, /* new, uninitialised record */ - AFS_VL_CREATING, /* creating record */ - AFS_VL_VALID, /* record is pending */ - AFS_VL_NO_VOLUME, /* no such volume available */ - AFS_VL_UPDATING, /* update in progress */ - AFS_VL_VOLUME_DELETED, /* volume was deleted */ - AFS_VL_UNCERTAIN, /* uncertain state (update failed) */ -} __attribute__((packed)) afs_vlocation_state_t; - struct afs_mount_params { bool rwpath; /* T if the parent should be considered R/W */ bool force; /* T to force cell type */ @@ -48,20 +39,43 @@ struct afs_mount_params { afs_voltype_t type; /* type of volume requested */ int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ + struct afs_net *net; /* Network namespace in effect */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ struct key *key; /* key to use for secure mounting */ }; +struct afs_iget_data { + struct afs_fid fid; + struct afs_volume *volume; /* volume on which resides */ +}; + enum afs_call_state { - AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ - AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ - AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ - AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ - AFS_CALL_REPLYING, /* replying to incoming call */ - AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* Completed or failed */ + AFS_CALL_CL_REQUESTING, /* Client: Request is being sent */ + AFS_CALL_CL_AWAIT_REPLY, /* Client: Awaiting reply */ + AFS_CALL_CL_PROC_REPLY, /* Client: rxrpc call complete; processing reply */ + AFS_CALL_SV_AWAIT_OP_ID, /* Server: Awaiting op ID */ + AFS_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting 
request data */ + AFS_CALL_SV_REPLYING, /* Server: Replying */ + AFS_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ + AFS_CALL_COMPLETE, /* Completed or failed */ }; + +/* + * List of server addresses. + */ +struct afs_addr_list { + struct rcu_head rcu; /* Must be first */ + refcount_t usage; + u32 version; /* Version */ + unsigned short nr_addrs; + unsigned short index; /* Address currently in use */ + unsigned short nr_ipv4; /* Number of IPv4 addresses */ + unsigned long probed; /* Mask of servers that have been probed */ + unsigned long yfs; /* Mask of servers that are YFS */ + struct sockaddr_rxrpc addrs[]; +}; + /* * a record of an in-progress RxRPC call */ @@ -72,25 +86,25 @@ struct afs_call { struct work_struct work; /* actual work processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct key *key; /* security for this call */ - struct afs_server *server; /* server affected by incoming CM call */ + struct afs_net *net; /* The network namespace */ + struct afs_server *cm_server; /* Server affected by incoming CM call */ + struct afs_cb_interest *cbi; /* Callback interest for server used */ void *request; /* request data (first part) */ - struct address_space *mapping; /* page set */ - struct afs_writeback *wb; /* writeback being performed */ + struct address_space *mapping; /* Pages being written from */ void *buffer; /* reply receive buffer */ - void *reply; /* reply buffer (first part) */ - void *reply2; /* reply buffer (second part) */ - void *reply3; /* reply buffer (third part) */ - void *reply4; /* reply buffer (fourth part) */ + void *reply[4]; /* Where to put the reply */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ size_t offset; /* offset into received data store */ atomic_t usage; enum afs_call_state state; + spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned first_offset; /* offset into mapping[first] */ + unsigned int cb_break; /* cb_break + cb_s_break before the call */ union { unsigned last_to; /* amount of mapping[last] */ unsigned count2; /* count used in unmarshalling */ @@ -100,8 +114,9 @@ struct afs_call { bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ - u16 service_id; /* RxRPC service ID to call */ - __be16 port; /* target UDP port */ + bool ret_reply0; /* T if should return reply[0] on success */ + bool upgrade; /* T to request service upgrade */ + u16 service_id; /* Actual service ID (after upgrade) */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ @@ -110,15 +125,13 @@ struct afs_call { struct afs_call_type { const char *name; + unsigned int op; /* Really enum afs_fs_operation */ /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ int (*deliver)(struct afs_call *call); - /* map an abort code to an error number */ - int (*abort_to_error)(u32 abort_code); - /* clean up a call */ void (*destructor)(struct afs_call *call); @@ -127,6 +140,30 @@ struct afs_call_type { }; /* + * Key available for writeback on a file. 
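+ * These live on vnode->wb_keys; each open file may also pin one through
+ * afs_file::wb (see afs_cache_wb_key()) so that writeback has a key to
+ * store the dirty data back with.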
+ */ +struct afs_wb_key { + refcount_t usage; + struct key *key; + struct list_head vnode_link; /* Link in vnode->wb_keys */ +}; + +/* + * AFS open file information record. Pointed to by file->private_data. + */ +struct afs_file { + struct key *key; /* The key this file was opened with */ + struct afs_wb_key *wb; /* Writeback key record for this file */ +}; + +static inline struct key *afs_file_key(struct file *file) +{ + struct afs_file *af = file->private_data; + + return af->key; +} + +/* * Record of an outstanding read operation on a vnode. */ struct afs_read { @@ -142,38 +179,13 @@ struct afs_read { }; /* - * record of an outstanding writeback on a vnode - */ -struct afs_writeback { - struct list_head link; /* link in vnode->writebacks */ - struct work_struct writer; /* work item to perform the writeback */ - struct afs_vnode *vnode; /* vnode to which this write applies */ - struct key *key; /* owner of this write */ - wait_queue_head_t waitq; /* completion and ready wait queue */ - pgoff_t first; /* first page in batch */ - pgoff_t point; /* last page in current store op */ - pgoff_t last; /* last page in batch (inclusive) */ - unsigned offset_first; /* offset into first page of start of write */ - unsigned to_last; /* offset into last page of end of write */ - int num_conflicts; /* count of conflicting writes in list */ - int usage; - bool conflicts; /* T if has dependent conflicts */ - enum { - AFS_WBACK_SYNCING, /* synchronisation being performed */ - AFS_WBACK_PENDING, /* write pending */ - AFS_WBACK_CONFLICTING, /* conflicting writes posted */ - AFS_WBACK_WRITING, /* writing back */ - AFS_WBACK_COMPLETE /* the writeback record has been unlinked */ - } state __attribute__((packed)); -}; - -/* * AFS superblock private data * - there's one superblock per volume */ struct afs_super_info { + struct afs_net *net; /* Network namespace */ + struct afs_cell *cell; /* The cell in which the volume resides */ struct afs_volume *volume; /* volume record */ - char rwparent; /* T if parent is R/W AFS volume */ }; static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) @@ -184,204 +196,297 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) extern struct file_system_type afs_fs_type; /* - * entry in the cached cell catalogue + * AFS network namespace record. */ -struct afs_cache_cell { - char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */ - struct in_addr vl_servers[15]; /* cached cell VL servers */ +struct afs_net { + struct afs_uuid uuid; + bool live; /* F if this namespace is being removed */ + + /* AF_RXRPC I/O stuff */ + struct socket *socket; + struct afs_call *spare_incoming_call; + struct work_struct charge_preallocation_work; + struct mutex socket_mutex; + atomic_t nr_outstanding_calls; + atomic_t nr_superblocks; + + /* Cell database */ + struct rb_root cells; + struct afs_cell *ws_cell; + struct work_struct cells_manager; + struct timer_list cells_timer; + atomic_t cells_outstanding; + seqlock_t cells_lock; + + spinlock_t proc_cells_lock; + struct list_head proc_cells; + + /* Known servers. Theoretically each fileserver can only be in one + * cell, but in practice, people create aliases and subsets and there's + * no easy way to distinguish them. 
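+ * Fileserver records are therefore kept per network namespace rather
+ * than per cell, keyed by UUID in fs_servers below.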
+ */ + seqlock_t fs_lock; /* For fs_servers */ + struct rb_root fs_servers; /* afs_server (by server UUID or address) */ + struct list_head fs_updates; /* afs_server (by update_at) */ + struct hlist_head fs_proc; /* procfs servers list */ + + struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */ + struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */ + seqlock_t fs_addr_lock; /* For fs_addresses[46] */ + + struct work_struct fs_manager; + struct timer_list fs_timer; + atomic_t servers_outstanding; + + /* File locking renewal management */ + struct mutex lock_manager_mutex; + + /* Misc */ + struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ +}; + +extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns + +enum afs_cell_state { + AFS_CELL_UNSET, + AFS_CELL_ACTIVATING, + AFS_CELL_ACTIVE, + AFS_CELL_DEACTIVATING, + AFS_CELL_INACTIVE, + AFS_CELL_FAILED, }; /* - * AFS cell record + * AFS cell record. + * + * This is a tricky concept to get right as it is possible to create aliases + * simply by pointing AFSDB/SRV records for two names at the same set of VL + * servers; it is also possible to do things like setting up two sets of VL + * servers, one of which provides a superset of the volumes provided by the + * other (for internal/external division, for example). + * + * Cells only exist in the sense that (a) a cell's name maps to a set of VL + * servers and (b) a cell's name is used by the client to select the key to use + * for authentication and encryption. The cell name is not typically used in + * the protocol. + * + * There is no easy way to determine if two cells are aliases or one is a + * subset of another. */ struct afs_cell { - atomic_t usage; - struct list_head link; /* main cell list link */ + union { + struct rcu_head rcu; + struct rb_node net_node; /* Node in net->cells */ + }; + struct afs_net *net; struct key *anonymous_key; /* anonymous user key for this cell */ + struct work_struct manager; /* Manager for init/deinit/dns */ struct list_head proc_link; /* /proc cell list link */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - - /* server record management */ - rwlock_t servers_lock; /* active server list lock */ - struct list_head servers; /* active server list */ - - /* volume location record management */ - struct rw_semaphore vl_sem; /* volume management serialisation semaphore */ - struct list_head vl_list; /* cell's active VL record list */ - spinlock_t vl_lock; /* vl_list lock */ - unsigned short vl_naddrs; /* number of VL servers in addr list */ - unsigned short vl_curr_svix; /* current server index */ - struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */ - - char name[0]; /* cell name - must go last */ + time64_t dns_expiry; /* Time AFSDB/SRV record expires */ + time64_t last_inactive; /* Time of last drop of usage count */ + atomic_t usage; + unsigned long flags; +#define AFS_CELL_FL_NOT_READY 0 /* The cell record is not ready for use */ +#define AFS_CELL_FL_NO_GC 1 /* The cell was added manually, don't auto-gc */ +#define AFS_CELL_FL_NOT_FOUND 2 /* Permanent DNS error */ +#define AFS_CELL_FL_DNS_FAIL 3 /* Failed to access DNS */ +#define AFS_CELL_FL_NO_LOOKUP_YET 4 /* Not completed first DNS lookup yet */ + enum afs_cell_state state; + short error; + + /* Active fileserver interaction state. */ + struct list_head proc_volumes; /* procfs volume list */ + rwlock_t proc_lock; + + /* VL server list. 
*/ + rwlock_t vl_addrs_lock; /* Lock on vl_addrs */ + struct afs_addr_list __rcu *vl_addrs; /* List of VL servers */ + u8 name_len; /* Length of name */ + char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */ }; /* - * entry in the cached volume location catalogue + * Cached VLDB entry. + * + * This is pointed to by cell->vldb_entries, indexed by name. */ -struct afs_cache_vlocation { - /* volume name (lowercase, padded with NULs) */ - uint8_t name[AFS_MAXVOLNAME + 1]; +struct afs_vldb_entry { + afs_volid_t vid[3]; /* Volume IDs for R/W, R/O and Bak volumes */ - uint8_t nservers; /* number of entries used in servers[] */ - uint8_t vidmask; /* voltype mask for vid[] */ - uint8_t srvtmask[8]; /* voltype masks for servers[] */ + unsigned long flags; +#define AFS_VLDB_HAS_RW 0 /* - R/W volume exists */ +#define AFS_VLDB_HAS_RO 1 /* - R/O volume exists */ +#define AFS_VLDB_HAS_BAK 2 /* - Backup volume exists */ +#define AFS_VLDB_QUERY_VALID 3 /* - Record is valid */ +#define AFS_VLDB_QUERY_ERROR 4 /* - VL server returned error */ + + uuid_t fs_server[AFS_NMAXNSERVERS]; + u8 fs_mask[AFS_NMAXNSERVERS]; #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ - - afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */ - struct in_addr servers[8]; /* fileserver addresses */ - time_t rtime; /* last retrieval time */ + short error; + u8 nr_servers; /* Number of server records */ + u8 name_len; + u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; /* - * volume -> vnode hash table entry + * Record of fileserver with which we're actively communicating. 
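+ * Records are refcounted and garbage-collected some time after the last
+ * ref is put (see put_time and the gc_next list used by the server
+ * manager).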
*/ -struct afs_cache_vhash { - afs_voltype_t vtype; /* which volume variation */ - uint8_t hash_bucket; /* which hash bucket this represents */ -} __attribute__((packed)); +struct afs_server { + struct rcu_head rcu; + union { + uuid_t uuid; /* Server ID */ + struct afs_uuid _uuid; + }; -/* - * AFS volume location record - */ -struct afs_vlocation { + struct afs_addr_list __rcu *addresses; + struct rb_node uuid_rb; /* Link in net->servers */ + struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ + struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ + struct hlist_node proc_link; /* Link in net->fs_proc */ + struct afs_server *gc_next; /* Next server in manager's list */ + time64_t put_time; /* Time at which last put */ + time64_t update_at; /* Time at which to next update the record */ + unsigned long flags; +#define AFS_SERVER_FL_NEW 0 /* New server, don't inc cb_s_break */ +#define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ +#define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ +#define AFS_SERVER_FL_UPDATING 4 +#define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ +#define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ atomic_t usage; - time64_t time_of_death; /* time at which put reduced usage to 0 */ - struct list_head link; /* link in cell volume location list */ - struct list_head grave; /* link in master graveyard list */ - struct list_head update; /* link in master update list */ - struct afs_cell *cell; /* cell to which volume belongs */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif - struct afs_cache_vlocation vldb; /* volume information DB record */ - struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ - wait_queue_head_t waitq; /* status change waitqueue */ - time64_t update_at; /* time at which record should be updated */ - spinlock_t lock; /* access lock */ - afs_vlocation_state_t state; /* volume location state */ - unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ - unsigned short upd_busy_cnt; /* EBUSY count during update */ - bool valid; /* T if valid */ + u32 addr_version; /* Address list version */ + + /* file service access */ + rwlock_t fs_lock; /* access lock */ + + /* callback promise management */ + struct list_head cb_interests; /* List of superblocks using this server */ + unsigned cb_s_break; /* Break-everything counter. */ + rwlock_t cb_break_lock; /* Volume finding lock */ }; /* - * AFS fileserver record + * Interest by a superblock on a server. 
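+ * This ties a superblock (and hence a volume) to a fileserver so that
+ * callback breaks arriving from that server can be matched back to the
+ * vnodes concerned; entries sit on server->cb_interests and are pinned
+ * by vnode->cb_interest references.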
*/ -struct afs_server { - atomic_t usage; - time64_t time_of_death; /* time at which put reduced usage to 0 */ - struct in_addr addr; /* server address */ - struct afs_cell *cell; /* cell in which server resides */ - struct list_head link; /* link in cell's server list */ - struct list_head grave; /* link in master graveyard list */ - struct rb_node master_rb; /* link in master by-addr tree */ - struct rw_semaphore sem; /* access lock */ +struct afs_cb_interest { + struct list_head cb_link; /* Link in server->cb_interests */ + struct afs_server *server; /* Server on which this interest resides */ + struct super_block *sb; /* Superblock on which inodes reside */ + afs_volid_t vid; /* Volume ID to match */ + refcount_t usage; +}; - /* file service access */ - struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */ - unsigned long fs_act_jif; /* time at which last activity occurred */ - unsigned long fs_dead_jif; /* time at which no longer to be considered dead */ - spinlock_t fs_lock; /* access lock */ - int fs_state; /* 0 or reason FS currently marked dead (-errno) */ +/* + * Replaceable server list. + */ +struct afs_server_entry { + struct afs_server *server; + struct afs_cb_interest *cb_interest; +}; - /* callback promise management */ - struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */ - struct delayed_work cb_updater; /* callback updater */ - struct delayed_work cb_break_work; /* collected break dispatcher */ - wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */ - spinlock_t cb_lock; /* access lock */ - struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */ - atomic_t cb_break_n; /* number of pending breaks */ - u8 cb_break_head; /* head of callback breaking ring */ - u8 cb_break_tail; /* tail of callback breaking ring */ +struct afs_server_list { + refcount_t usage; + unsigned short nr_servers; + unsigned short index; /* Server currently in use */ + unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ + unsigned int seq; /* Set to ->servers_seq when installed */ + struct afs_server_entry servers[]; }; /* - * AFS volume access record + * Live AFS volume management. 
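+ * The volume pins its cell and carries its own server list, which is
+ * periodically refreshed from the VLDB (see update_at and the
+ * AFS_VOLUME_NEEDS_UPDATE flag) instead of being fixed at mount time.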
*/ struct afs_volume { + afs_volid_t vid; /* volume ID */ atomic_t usage; - struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ - struct afs_vlocation *vlocation; /* volume location */ + time64_t update_at; /* Time at which to next update */ + struct afs_cell *cell; /* Cell to which belongs (pins ref) */ + struct list_head proc_link; /* Link in cell->vl_proc */ + unsigned long flags; +#define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ +#define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ +#define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ +#define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ +#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */ +#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - afs_volid_t vid; /* volume ID */ + struct afs_server_list *servers; /* List of servers on which volume resides */ + rwlock_t servers_lock; /* Lock for ->servers */ + unsigned int servers_seq; /* Incremented each time ->servers changes */ + afs_voltype_t type; /* type of volume */ + short error; char type_force; /* force volume type (suppress R/O -> R/W) */ - unsigned short nservers; /* number of server slots filled */ - unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ - struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ - struct rw_semaphore server_sem; /* lock for accessing current server */ + u8 name_len; + u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; -/* - * vnode catalogue entry - */ -struct afs_cache_vnode { - afs_vnodeid_t vnode_id; /* vnode ID */ - unsigned vnode_unique; /* vnode ID uniquifier */ - afs_dataversion_t data_version; /* data version */ +enum afs_lock_state { + AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ + AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ + AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ + AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ + AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ + AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ + AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ }; /* - * AFS inode private data + * AFS inode private data. + * + * Note that afs_alloc_inode() *must* reset anything that could incorrectly + * leak from one inode to another. 
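+ * (Vnodes are recycled through a slab cache, so anything left over from
+ * a previous lifetime - a stale cb_interest pointer, say - would be
+ * dangerous.)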
*/ struct afs_vnode { struct inode vfs_inode; /* the VFS's inode record */ struct afs_volume *volume; /* volume on which vnode resides */ - struct afs_server *server; /* server currently supplying this file */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - struct afs_permits *permits; /* cache of permits so far obtained */ - struct mutex permits_lock; /* lock for altering permits list */ + struct afs_permits *permit_cache; /* cache of permits so far obtained */ + struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct mutex validate_lock; /* lock for validating this vnode */ - wait_queue_head_t update_waitq; /* status fetch waitqueue */ - int update_cnt; /* number of outstanding ops that will update the - * status */ - spinlock_t writeback_lock; /* lock for writebacks */ + spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; -#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */ +#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ -#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */ +#define AFS_VNODE_DIR_MODIFIED 2 /* set if dir vnode's data modified */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ -#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ -#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ -#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ -#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ -#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ +#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ - long acl_order; /* ACL check count (callback break count) */ - - struct list_head writebacks; /* alterations in pagecache that need writing */ + struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ - struct key *unlock_key; /* key to be used in unlocking */ + struct key *lock_key; /* Key to be used in lock ops */ + enum afs_lock_state lock_state : 8; + afs_lock_type_t lock_type : 8; /* outstanding callback notification on this file */ - struct rb_node server_rb; /* link in server->fs_vnodes */ - struct rb_node cb_promise; /* link in server->cb_promises */ - struct work_struct cb_broken_work; /* work to be done on callback break */ - time64_t cb_expires; /* time at which callback expires */ - time64_t cb_expires_at; /* time used to order cb_promise */ + struct afs_cb_interest *cb_interest; /* Server on which this resides */ + unsigned int cb_s_break; /* Mass break counter on ->server */ + unsigned int cb_break; /* Break counter on vnode */ + seqlock_t cb_lock; /* Lock for ->cb_interest, ->status, ->cb_*break */ + + time64_t cb_expires_at; /* time at which callback expires */ 
unsigned cb_version; /* callback version */ - unsigned cb_expiry; /* callback expiry time */ afs_callback_type_t cb_type; /* type of callback */ - bool cb_promised; /* true if promise still holds */ }; /* @@ -389,16 +494,21 @@ struct afs_vnode { */ struct afs_permit { struct key *key; /* RxRPC ticket holding a security context */ - afs_access_t access_mask; /* access mask for this key */ + afs_access_t access; /* CallerAccess value for this key */ }; /* - * cache of security records from attempts to access a vnode + * Immutable cache of CallerAccess records from attempts to access vnodes. + * These may be shared between multiple vnodes. */ struct afs_permits { - struct rcu_head rcu; /* disposal procedure */ - int count; /* number of records */ - struct afs_permit permits[0]; /* the permits so far examined */ + struct rcu_head rcu; + struct hlist_node hash_node; /* Link in hash */ + unsigned long h; /* Hash value for this permit list */ + refcount_t usage; + unsigned short nr_permits; /* Number of records */ + bool invalidated; /* Invalidated due to key change */ + struct afs_permit permits[]; /* List of permits sorted by key pointer */ }; /* @@ -410,28 +520,78 @@ struct afs_interface { unsigned mtu; /* MTU of interface */ }; -struct afs_uuid { - __be32 time_low; /* low part of timestamp */ - __be16 time_mid; /* mid part of timestamp */ - __be16 time_hi_and_version; /* high part of timestamp and version */ - __u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ - __u8 clock_seq_low; /* clock seq low */ - __u8 node[6]; /* spatially unique node ID (MAC addr) */ +/* + * Cursor for iterating over a server's address list. + */ +struct afs_addr_cursor { + struct afs_addr_list *alist; /* Current address list (pins ref) */ + struct sockaddr_rxrpc *addr; + u32 abort_code; + unsigned short start; /* Starting point in alist->addrs[] */ + unsigned short index; /* Wrapping offset from start to current addr */ + short error; + bool begun; /* T if we've begun iteration */ + bool responded; /* T if the current address responded */ +}; + +/* + * Cursor for iterating over a set of fileservers. + */ +struct afs_fs_cursor { + struct afs_addr_cursor ac; + struct afs_vnode *vnode; + struct afs_server_list *server_list; /* Current server list (pins ref) */ + struct afs_cb_interest *cbi; /* Server on which this resides (pins ref) */ + struct key *key; /* Key for the server */ + unsigned int cb_break; /* cb_break + cb_s_break before the call */ + unsigned int cb_break_2; /* cb_break + cb_s_break (2nd vnode) */ + unsigned char start; /* Initial index in server list */ + unsigned char index; /* Number of servers tried beyond start */ + unsigned short flags; +#define AFS_FS_CURSOR_STOP 0x0001 /* Set to cease iteration */ +#define AFS_FS_CURSOR_VBUSY 0x0002 /* Set if seen VBUSY */ +#define AFS_FS_CURSOR_VMOVED 0x0004 /* Set if seen VMOVED */ +#define AFS_FS_CURSOR_VNOVOL 0x0008 /* Set if seen VNOVOL */ +#define AFS_FS_CURSOR_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ +#define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... 
*/ }; +#include <trace/events/afs.h> + /*****************************************************************************/ /* + * addr_list.c + */ +static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist) +{ + if (alist) + refcount_inc(&alist->usage); + return alist; +} +extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, + unsigned short, + unsigned short); +extern void afs_put_addrlist(struct afs_addr_list *); +extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char, + unsigned short, unsigned short); +extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *); +extern bool afs_iterate_addresses(struct afs_addr_cursor *); +extern int afs_end_cursor(struct afs_addr_cursor *); +extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *); + +extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); +extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); + +/* * cache.c */ #ifdef CONFIG_AFS_FSCACHE extern struct fscache_netfs afs_cache_netfs; extern struct fscache_cookie_def afs_cell_cache_index_def; -extern struct fscache_cookie_def afs_vlocation_cache_index_def; extern struct fscache_cookie_def afs_volume_cache_index_def; extern struct fscache_cookie_def afs_vnode_cache_index_def; #else #define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) #define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) #define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) #endif @@ -440,29 +600,31 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; * callback.c */ extern void afs_init_callback_state(struct afs_server *); -extern void afs_broken_callback_work(struct work_struct *); -extern void afs_break_callbacks(struct afs_server *, size_t, - struct afs_callback[]); -extern void afs_discard_callback_on_delete(struct afs_vnode *); -extern void afs_give_up_callback(struct afs_vnode *); -extern void afs_dispatch_give_up_callbacks(struct work_struct *); -extern void afs_flush_callback_breaks(struct afs_server *); -extern int __init afs_callback_update_init(void); -extern void afs_callback_update_kill(void); +extern void afs_break_callback(struct afs_vnode *); +extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]); + +extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *); +extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); +extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *); + +static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi) +{ + refcount_inc(&cbi->usage); + return cbi; +} /* * cell.c */ -extern struct rw_semaphore afs_proc_cells_sem; -extern struct list_head afs_proc_cells; - -#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) -extern int afs_cell_init(char *); -extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool); -extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool); -extern struct afs_cell *afs_grab_cell(struct afs_cell *); -extern void afs_put_cell(struct afs_cell *); -extern void afs_cell_purge(void); +extern int afs_cell_init(struct afs_net *, const char *); +extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned); +extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, + const char *, bool); 
+extern struct afs_cell *afs_get_cell(struct afs_cell *); +extern void afs_put_cell(struct afs_net *, struct afs_cell *); +extern void afs_manage_cells(struct work_struct *); +extern void afs_cells_timer(struct timer_list *); +extern void __net_exit afs_cell_purge(struct afs_net *); /* * cmservice.c @@ -472,6 +634,7 @@ extern bool afs_cm_incoming_call(struct afs_call *); /* * dir.c */ +extern bool afs_dir_check_page(struct inode *, struct page *); extern const struct inode_operations afs_dir_inode_operations; extern const struct dentry_operations afs_fs_dentry_operations; extern const struct file_operations afs_dir_file_operations; @@ -483,15 +646,19 @@ extern const struct address_space_operations afs_fs_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); +extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); +extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *); extern int afs_page_filler(void *, struct page *); extern void afs_put_read(struct afs_read *); /* * flock.c */ -extern void __exit afs_kill_lock_manager(void); +extern struct workqueue_struct *afs_lock_manager; + extern void afs_lock_work(struct work_struct *); extern void afs_lock_may_be_available(struct afs_vnode *); extern int afs_lock(struct file *, int, struct file_lock *); @@ -500,48 +667,40 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, - struct afs_vnode *, struct afs_volsync *, - bool); -extern int afs_fs_give_up_callbacks(struct afs_server *, bool); -extern int afs_fs_fetch_data(struct afs_server *, struct key *, - struct afs_vnode *, struct afs_read *, bool); -extern int afs_fs_create(struct afs_server *, struct key *, - struct afs_vnode *, const char *, umode_t, - struct afs_fid *, struct afs_file_status *, - struct afs_callback *, bool); -extern int afs_fs_remove(struct afs_server *, struct key *, - struct afs_vnode *, const char *, bool, bool); -extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *, - struct afs_vnode *, const char *, bool); -extern int afs_fs_symlink(struct afs_server *, struct key *, - struct afs_vnode *, const char *, const char *, - struct afs_fid *, struct afs_file_status *, bool); -extern int afs_fs_rename(struct afs_server *, struct key *, - struct afs_vnode *, const char *, - struct afs_vnode *, const char *, bool); -extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *, - pgoff_t, pgoff_t, unsigned, unsigned, bool); -extern int afs_fs_setattr(struct afs_server *, struct key *, - struct afs_vnode *, struct iattr *, bool); -extern int afs_fs_get_volume_status(struct afs_server *, struct key *, - struct afs_vnode *, - struct afs_volume_status *, bool); -extern int afs_fs_set_lock(struct afs_server *, struct key *, - struct afs_vnode *, afs_lock_type_t, bool); -extern int afs_fs_extend_lock(struct afs_server *, struct key *, - struct afs_vnode *, bool); -extern int afs_fs_release_lock(struct afs_server *, struct key *, - struct afs_vnode *, bool); +extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *); +extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); +extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); 
+extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, + struct afs_fid *, struct afs_file_status *, struct afs_callback *); +extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool); +extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *); +extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, + struct afs_fid *, struct afs_file_status *); +extern int afs_fs_rename(struct afs_fs_cursor *, const char *, + struct afs_vnode *, const char *); +extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *, + pgoff_t, pgoff_t, unsigned, unsigned); +extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *); +extern int afs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *); +extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t); +extern int afs_fs_extend_lock(struct afs_fs_cursor *); +extern int afs_fs_release_lock(struct afs_fs_cursor *); +extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, + struct afs_addr_cursor *, struct key *); +extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *, + struct afs_addr_cursor *, struct key *); /* * inode.c */ +extern int afs_fetch_status(struct afs_vnode *, struct key *); +extern int afs_iget5_test(struct inode *, void *); extern struct inode *afs_iget_autocell(struct inode *, const char *, int, struct key *); extern struct inode *afs_iget(struct super_block *, struct key *, struct afs_fid *, struct afs_file_status *, - struct afs_callback *); + struct afs_callback *, + struct afs_cb_interest *); extern void afs_zap_data(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int); @@ -553,7 +712,35 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; -extern struct afs_uuid afs_uuid; + +static inline struct afs_net *afs_d2net(struct dentry *dentry) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_i2net(struct inode *inode) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_sock2net(struct sock *sk) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_get_net(struct afs_net *net) +{ + return net; +} + +static inline void afs_put_net(struct afs_net *net) +{ +} /* * misc.c @@ -578,23 +765,33 @@ extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); /* * proc.c */ -extern int afs_proc_init(void); -extern void afs_proc_cleanup(void); -extern int afs_proc_cell_setup(struct afs_cell *); -extern void afs_proc_cell_remove(struct afs_cell *); +extern int __net_init afs_proc_init(struct afs_net *); +extern void __net_exit afs_proc_cleanup(struct afs_net *); +extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *); +extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *); + +/* + * rotate.c + */ +extern bool afs_begin_vnode_operation(struct afs_fs_cursor *, struct afs_vnode *, + struct key *); +extern bool afs_select_fileserver(struct afs_fs_cursor *); +extern bool afs_select_current_fileserver(struct afs_fs_cursor *); +extern int afs_end_vnode_operation(struct afs_fs_cursor *); /* * rxrpc.c */ -extern struct socket *afs_socket; -extern atomic_t afs_outstanding_calls; +extern struct workqueue_struct *afs_async_calls; -extern int afs_open_socket(void); -extern void 
afs_close_socket(void); +extern int __net_init afs_open_socket(struct afs_net *); +extern void __net_exit afs_close_socket(struct afs_net *); +extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); extern int afs_queue_call_work(struct afs_call *); -extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, bool); -extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, +extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool); +extern struct afs_call *afs_alloc_flat_call(struct afs_net *, + const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); @@ -606,117 +803,136 @@ static inline int afs_transfer_reply(struct afs_call *call) return afs_extract_data(call, call->buffer, call->reply_max, false); } +static inline bool afs_check_call_state(struct afs_call *call, + enum afs_call_state state) +{ + return READ_ONCE(call->state) == state; +} + +static inline bool afs_set_call_state(struct afs_call *call, + enum afs_call_state from, + enum afs_call_state to) +{ + bool ok = false; + + spin_lock_bh(&call->state_lock); + if (call->state == from) { + call->state = to; + trace_afs_call_state(call, from, to, 0, 0); + ok = true; + } + spin_unlock_bh(&call->state_lock); + return ok; +} + +static inline void afs_set_call_complete(struct afs_call *call, + int error, u32 remote_abort) +{ + enum afs_call_state state; + bool ok = false; + + spin_lock_bh(&call->state_lock); + state = call->state; + if (state != AFS_CALL_COMPLETE) { + call->abort_code = remote_abort; + call->error = error; + call->state = AFS_CALL_COMPLETE; + trace_afs_call_state(call, state, AFS_CALL_COMPLETE, + error, remote_abort); + ok = true; + } + spin_unlock_bh(&call->state_lock); + if (ok) + trace_afs_call_done(call); +} + /* * security.c */ +extern void afs_put_permits(struct afs_permits *); extern void afs_clear_permits(struct afs_vnode *); -extern void afs_cache_permit(struct afs_vnode *, struct key *, long); +extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); +extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct inode *, int); +extern void __exit afs_clean_up_permit_cache(void); /* * server.c */ extern spinlock_t afs_server_peer_lock; -#define afs_get_server(S) \ -do { \ - _debug("GET SERVER %d", atomic_read(&(S)->usage)); \ - atomic_inc(&(S)->usage); \ -} while(0) +static inline struct afs_server *afs_get_server(struct afs_server *server) +{ + atomic_inc(&server->usage); + return server; +} -extern struct afs_server *afs_lookup_server(struct afs_cell *, - const struct in_addr *); -extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *); -extern void afs_put_server(struct afs_server *); -extern void __exit afs_purge_servers(void); +extern struct afs_server *afs_find_server(struct afs_net *, + const struct sockaddr_rxrpc *); +extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); +extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *); +extern void afs_put_server(struct afs_net *, struct afs_server *); +extern void afs_manage_servers(struct work_struct *); +extern void afs_servers_timer(struct timer_list *); +extern void __net_exit afs_purge_servers(struct 
afs_net *); +extern bool afs_probe_fileserver(struct afs_fs_cursor *); +extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *); /* - * super.c + * server_list.c */ -extern int afs_fs_init(void); -extern void afs_fs_exit(void); +static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist) +{ + refcount_inc(&slist->usage); + return slist; +} -/* - * vlclient.c - */ -extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, - const char *, struct afs_cache_vlocation *, - bool); -extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *, - afs_volid_t, afs_voltype_t, - struct afs_cache_vlocation *, bool); +extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); +extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *, + struct afs_vldb_entry *, + u8); +extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); /* - * vlocation.c + * super.c */ -#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0) - -extern int __init afs_vlocation_update_init(void); -extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *, - struct key *, - const char *, size_t); -extern void afs_put_vlocation(struct afs_vlocation *); -extern void afs_vlocation_purge(void); +extern int __init afs_fs_init(void); +extern void __exit afs_fs_exit(void); /* - * vnode.c + * vlclient.c */ -static inline struct afs_vnode *AFS_FS_I(struct inode *inode) -{ - return container_of(inode, struct afs_vnode, vfs_inode); -} - -static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) -{ - return &vnode->vfs_inode; -} - -extern void afs_vnode_finalise_status_update(struct afs_vnode *, - struct afs_server *); -extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *, - struct key *); -extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *, - struct afs_read *); -extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *, - umode_t, struct afs_fid *, struct afs_file_status *, - struct afs_callback *, struct afs_server **); -extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *, - bool); -extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *, - const char *); -extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *, - const char *, struct afs_fid *, - struct afs_file_status *, struct afs_server **); -extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *, - struct key *, const char *, const char *); -extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t, - unsigned, unsigned); -extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); -extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, - struct afs_volume_status *); -extern int afs_vnode_set_lock(struct afs_vnode *, struct key *, - afs_lock_type_t); -extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *); -extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); +extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *, + struct afs_addr_cursor *, + struct key *, const char *, int); +extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *, + struct key *, const uuid_t *); +extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *); +extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor 
*, + struct key *, const uuid_t *); /* * volume.c */ -#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) +static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume) +{ + if (volume) + atomic_inc(&volume->usage); + return volume; +} -extern void afs_put_volume(struct afs_volume *); -extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *); -extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *); -extern int afs_volume_release_fileserver(struct afs_vnode *, - struct afs_server *, int); +extern struct afs_volume *afs_create_volume(struct afs_mount_params *); +extern void afs_activate_volume(struct afs_volume *); +extern void afs_deactivate_volume(struct afs_volume *); +extern void afs_put_volume(struct afs_cell *, struct afs_volume *); +extern int afs_check_volume_status(struct afs_volume *, struct key *); /* * write.c */ extern int afs_set_page_dirty(struct page *); -extern void afs_put_writeback(struct afs_writeback *); extern int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); @@ -727,9 +943,11 @@ extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); -extern int afs_writeback_all(struct afs_vnode *); extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); +extern int afs_page_mkwrite(struct vm_fault *); +extern void afs_prune_wb_keys(struct afs_vnode *); +extern int afs_launder_page(struct page *); /* * xattr.c @@ -737,12 +955,42 @@ extern int afs_fsync(struct file *, loff_t, loff_t, int); extern const struct xattr_handler *afs_xattr_handlers[]; extern ssize_t afs_listxattr(struct dentry *, char *, size_t); + +/* + * Miscellaneous inline functions. + */ +static inline struct afs_vnode *AFS_FS_I(struct inode *inode) +{ + return container_of(inode, struct afs_vnode, vfs_inode); +} + +static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) +{ + return &vnode->vfs_inode; +} + +static inline void afs_vnode_commit_status(struct afs_fs_cursor *fc, + struct afs_vnode *vnode, + unsigned int cb_break) +{ + if (fc->ac.error == 0) + afs_cache_permit(vnode, fc->key, cb_break); +} + +static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc, + struct afs_vnode *vnode) +{ + if (fc->ac.error == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_break_callback(vnode); + } +} + + /*****************************************************************************/ /* * debug tracing */ -#include <trace/events/afs.h> - extern unsigned afs_debug; #define dbgprintk(FMT,...) \ diff --git a/fs/afs/main.c b/fs/afs/main.c index 9944770849da..15a02a05ff40 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -31,57 +31,112 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -struct afs_uuid afs_uuid; struct workqueue_struct *afs_wq; +struct afs_net __afs_net; /* - * initialise the AFS client FS module + * Initialise an AFS network namespace record. 
*/ -static int __init afs_init(void) +static int __net_init afs_net_init(struct afs_net *net) { int ret; - printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + net->live = true; + generate_random_uuid((unsigned char *)&net->uuid); - generate_random_uuid((unsigned char *)&afs_uuid); + INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + mutex_init(&net->socket_mutex); - /* create workqueue */ - ret = -ENOMEM; - afs_wq = alloc_workqueue("afs", 0, 0); - if (!afs_wq) - return ret; + net->cells = RB_ROOT; + seqlock_init(&net->cells_lock); + INIT_WORK(&net->cells_manager, afs_manage_cells); + timer_setup(&net->cells_timer, afs_cells_timer, 0); - /* register the /proc stuff */ - ret = afs_proc_init(); - if (ret < 0) - goto error_proc; + spin_lock_init(&net->proc_cells_lock); + INIT_LIST_HEAD(&net->proc_cells); -#ifdef CONFIG_AFS_FSCACHE - /* we want to be able to cache */ - ret = fscache_register_netfs(&afs_cache_netfs); + seqlock_init(&net->fs_lock); + net->fs_servers = RB_ROOT; + INIT_LIST_HEAD(&net->fs_updates); + INIT_HLIST_HEAD(&net->fs_proc); + + INIT_HLIST_HEAD(&net->fs_addresses4); + INIT_HLIST_HEAD(&net->fs_addresses6); + seqlock_init(&net->fs_addr_lock); + + INIT_WORK(&net->fs_manager, afs_manage_servers); + timer_setup(&net->fs_timer, afs_servers_timer, 0); + + /* Register the /proc stuff */ + ret = afs_proc_init(net); if (ret < 0) - goto error_cache; -#endif + goto error_proc; - /* initialise the cell DB */ - ret = afs_cell_init(rootcell); + /* Initialise the cell DB */ + ret = afs_cell_init(net, rootcell); if (ret < 0) goto error_cell_init; - /* initialise the VL update process */ - ret = afs_vlocation_update_init(); + /* Create the RxRPC transport */ + ret = afs_open_socket(net); if (ret < 0) - goto error_vl_update_init; + goto error_open_socket; - /* initialise the callback update process */ - ret = afs_callback_update_init(); + return 0; + +error_open_socket: + net->live = false; + afs_cell_purge(net); + afs_purge_servers(net); +error_cell_init: + net->live = false; + afs_proc_cleanup(net); +error_proc: + net->live = false; + return ret; +} + +/* + * Clean up and destroy an AFS network namespace record. 
+ */ +static void __net_exit afs_net_exit(struct afs_net *net) +{ + net->live = false; + afs_cell_purge(net); + afs_purge_servers(net); + afs_close_socket(net); + afs_proc_cleanup(net); +} + +/* + * initialise the AFS client FS module + */ +static int __init afs_init(void) +{ + int ret = -ENOMEM; + + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + + afs_wq = alloc_workqueue("afs", 0, 0); + if (!afs_wq) + goto error_afs_wq; + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); + if (!afs_async_calls) + goto error_async; + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); + if (!afs_lock_manager) + goto error_lockmgr; + +#ifdef CONFIG_AFS_FSCACHE + /* we want to be able to cache */ + ret = fscache_register_netfs(&afs_cache_netfs); if (ret < 0) - goto error_callback_update_init; + goto error_cache; +#endif - /* create the RxRPC transport */ - ret = afs_open_socket(); + ret = afs_net_init(&__afs_net); if (ret < 0) - goto error_open_socket; + goto error_net; /* register the filesystems */ ret = afs_fs_init(); @@ -91,21 +146,18 @@ static int __init afs_init(void) return ret; error_fs: - afs_close_socket(); -error_open_socket: - afs_callback_update_kill(); -error_callback_update_init: - afs_vlocation_purge(); -error_vl_update_init: - afs_cell_purge(); -error_cell_init: + afs_net_exit(&__afs_net); +error_net: #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif - afs_proc_cleanup(); -error_proc: + destroy_workqueue(afs_lock_manager); +error_lockmgr: + destroy_workqueue(afs_async_calls); +error_async: destroy_workqueue(afs_wq); +error_afs_wq: rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); return ret; @@ -124,17 +176,14 @@ static void __exit afs_exit(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); afs_fs_exit(); - afs_kill_lock_manager(); - afs_close_socket(); - afs_purge_servers(); - afs_callback_update_kill(); - afs_vlocation_purge(); - destroy_workqueue(afs_wq); - afs_cell_purge(); + afs_net_exit(&__afs_net); #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); #endif - afs_proc_cleanup(); + destroy_workqueue(afs_lock_manager); + destroy_workqueue(afs_async_calls); + destroy_workqueue(afs_wq); + afs_clean_up_permit_cache(); rcu_barrier(); } diff --git a/fs/afs/misc.c b/fs/afs/misc.c index c05f1f1c0d41..700a5fa7f4ec 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -21,12 +21,12 @@ int afs_abort_to_error(u32 abort_code) { switch (abort_code) { - /* low errno codes inserted into abort namespace */ + /* Low errno codes inserted into abort namespace */ case 13: return -EACCES; case 27: return -EFBIG; case 30: return -EROFS; - /* VICE "special error" codes; 101 - 111 */ + /* VICE "special error" codes; 101 - 111 */ case VSALVAGE: return -EIO; case VNOVNODE: return -ENOENT; case VNOVOL: return -ENOMEDIUM; @@ -39,7 +39,37 @@ int afs_abort_to_error(u32 abort_code) case VBUSY: return -EBUSY; case VMOVED: return -ENXIO; - /* Unified AFS error table; ET "uae" == 0x2f6df00 */ + /* Volume Location server errors */ + case AFSVL_IDEXIST: return -EEXIST; + case AFSVL_IO: return -EREMOTEIO; + case AFSVL_NAMEEXIST: return -EEXIST; + case AFSVL_CREATEFAIL: return -EREMOTEIO; + case AFSVL_NOENT: return -ENOMEDIUM; + case AFSVL_EMPTY: return -ENOMEDIUM; + case AFSVL_ENTDELETED: return -ENOMEDIUM; + case AFSVL_BADNAME: return -EINVAL; + case AFSVL_BADINDEX: return -EINVAL; + case AFSVL_BADVOLTYPE: return -EINVAL; + case AFSVL_BADSERVER: return -EINVAL; + case 
AFSVL_BADPARTITION: return -EINVAL; + case AFSVL_REPSFULL: return -EFBIG; + case AFSVL_NOREPSERVER: return -ENOENT; + case AFSVL_DUPREPSERVER: return -EEXIST; + case AFSVL_RWNOTFOUND: return -ENOENT; + case AFSVL_BADREFCOUNT: return -EINVAL; + case AFSVL_SIZEEXCEEDED: return -EINVAL; + case AFSVL_BADENTRY: return -EINVAL; + case AFSVL_BADVOLIDBUMP: return -EINVAL; + case AFSVL_IDALREADYHASHED: return -EINVAL; + case AFSVL_ENTRYLOCKED: return -EBUSY; + case AFSVL_BADVOLOPER: return -EBADRQC; + case AFSVL_BADRELLOCKTYPE: return -EINVAL; + case AFSVL_RERELEASE: return -EREMOTEIO; + case AFSVL_BADSERVERFLAG: return -EINVAL; + case AFSVL_PERM: return -EACCES; + case AFSVL_NOMEM: return -EREMOTEIO; + + /* Unified AFS error table; ET "uae" == 0x2f6df00 */ case 0x2f6df00: return -EPERM; case 0x2f6df01: return -ENOENT; case 0x2f6df04: return -EIO; @@ -68,7 +98,7 @@ int afs_abort_to_error(u32 abort_code) case 0x2f6df6c: return -ETIMEDOUT; case 0x2f6df78: return -EDQUOT; - /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ + /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ case RXKADINCONSISTENCY: return -EPROTO; case RXKADPACKETSHORT: return -EPROTO; case RXKADLEVELFAIL: return -EKEYREJECTED; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 35efb9a31dd7..4508dd54f789 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -17,8 +17,15 @@ #include <linux/uaccess.h> #include "internal.h" -static struct proc_dir_entry *proc_afs; +static inline struct afs_net *afs_proc2net(struct file *f) +{ + return &__afs_net; +} +static inline struct afs_net *afs_seq2net(struct seq_file *m) +{ + return &__afs_net; // TODO: use seq_file_net(m) +} static int afs_proc_cells_open(struct inode *inode, struct file *file); static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos); @@ -98,22 +105,22 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .release = seq_release, }; -static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); -static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, +static int afs_proc_servers_open(struct inode *inode, struct file *file); +static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_servers_next(struct seq_file *p, void *v, loff_t *pos); -static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); -static int afs_proc_cell_servers_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_cell_servers_ops = { - .start = afs_proc_cell_servers_start, - .next = afs_proc_cell_servers_next, - .stop = afs_proc_cell_servers_stop, - .show = afs_proc_cell_servers_show, +static void afs_proc_servers_stop(struct seq_file *p, void *v); +static int afs_proc_servers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_servers_ops = { + .start = afs_proc_servers_start, + .next = afs_proc_servers_next, + .stop = afs_proc_servers_stop, + .show = afs_proc_servers_show, }; -static const struct file_operations afs_proc_cell_servers_fops = { - .open = afs_proc_cell_servers_open, +static const struct file_operations afs_proc_servers_fops = { + .open = afs_proc_servers_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -122,23 +129,24 @@ static const struct file_operations afs_proc_cell_servers_fops = { /* * initialise the /proc/fs/afs/ directory */ -int afs_proc_init(void) +int afs_proc_init(struct afs_net 
*net) { _enter(""); - proc_afs = proc_mkdir("fs/afs", NULL); - if (!proc_afs) + net->proc_afs = proc_mkdir("fs/afs", NULL); + if (!net->proc_afs) goto error_dir; - if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) || - !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops)) + if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || + !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops)) goto error_tree; _leave(" = 0"); return 0; error_tree: - remove_proc_subtree("fs/afs", NULL); + proc_remove(net->proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -147,9 +155,10 @@ error_dir: /* * clean up the /proc/fs/afs/ directory */ -void afs_proc_cleanup(void) +void afs_proc_cleanup(struct afs_net *net) { - remove_proc_subtree("fs/afs", NULL); + proc_remove(net->proc_afs); + net->proc_afs = NULL; } /* @@ -166,7 +175,6 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) m = file->private_data; m->private = PDE_DATA(inode); - return 0; } @@ -176,25 +184,28 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) { - /* lock the list against modification */ - down_read(&afs_proc_cells_sem); - return seq_list_start_head(&afs_proc_cells, *_pos); + struct afs_net *net = afs_seq2net(m); + + rcu_read_lock(); + return seq_list_start_head(&net->proc_cells, *_pos); } /* * move to next cell in cells list */ -static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) +static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &afs_proc_cells, pos); + struct afs_net *net = afs_seq2net(m); + + return seq_list_next(v, &net->proc_cells, pos); } /* * clean up after reading from the cells list */ -static void afs_proc_cells_stop(struct seq_file *p, void *v) +static void afs_proc_cells_stop(struct seq_file *m, void *v) { - up_read(&afs_proc_cells_sem); + rcu_read_unlock(); } /* @@ -203,16 +214,16 @@ static void afs_proc_cells_stop(struct seq_file *p, void *v) static int afs_proc_cells_show(struct seq_file *m, void *v) { struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + struct afs_net *net = afs_seq2net(m); - if (v == &afs_proc_cells) { + if (v == &net->proc_cells) { /* display header on line 1 */ seq_puts(m, "USE NAME\n"); return 0; } /* display one cell per line on subsequent lines */ - seq_printf(m, "%3d %s\n", - atomic_read(&cell->usage), cell->name); + seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name); return 0; } @@ -223,6 +234,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, size_t size, loff_t *_pos) { + struct afs_net *net = afs_proc2net(file); char *kbuf, *name, *args; int ret; @@ -264,13 +276,13 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, if (strcmp(kbuf, "add") == 0) { struct afs_cell *cell; - cell = afs_cell_create(name, strlen(name), args, false); + cell = afs_lookup_cell(net, name, strlen(name), args, true); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto done; } - afs_put_cell(cell); + set_bit(AFS_CELL_FL_NO_GC, &cell->flags); printk("kAFS: Added new cell '%s'\n", name); } else { goto inval; @@ -303,6 +315,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, const char __user *buf, size_t size, loff_t *_pos) { + struct afs_net 
*net = afs_proc2net(file); char *kbuf, *s; int ret; @@ -322,7 +335,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, /* determine command to perform */ _debug("rootcell=%s", kbuf); - ret = afs_cell_init(kbuf); + ret = afs_cell_init(net, kbuf); if (ret >= 0) ret = size; /* consume everything, always */ @@ -334,29 +347,27 @@ static ssize_t afs_proc_rootcell_write(struct file *file, /* * initialise /proc/fs/afs/<cell>/ */ -int afs_proc_cell_setup(struct afs_cell *cell) +int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell) { struct proc_dir_entry *dir; - _enter("%p{%s}", cell, cell->name); + _enter("%p{%s},%p", cell, cell->name, net->proc_afs); - dir = proc_mkdir(cell->name, proc_afs); + dir = proc_mkdir(cell->name, net->proc_afs); if (!dir) goto error_dir; - if (!proc_create_data("servers", 0, dir, - &afs_proc_cell_servers_fops, cell) || - !proc_create_data("vlservers", 0, dir, - &afs_proc_cell_vlservers_fops, cell) || + if (!proc_create_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_fops, cell) || !proc_create_data("volumes", 0, dir, - &afs_proc_cell_volumes_fops, cell)) + &afs_proc_cell_volumes_fops, cell)) goto error_tree; _leave(" = 0"); return 0; error_tree: - remove_proc_subtree(cell->name, proc_afs); + remove_proc_subtree(cell->name, net->proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -365,11 +376,11 @@ error_dir: /* * remove /proc/fs/afs/<cell>/ */ -void afs_proc_cell_remove(struct afs_cell *cell) +void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell) { _enter(""); - remove_proc_subtree(cell->name, proc_afs); + remove_proc_subtree(cell->name, net->proc_afs); _leave(""); } @@ -407,9 +418,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) _enter("cell=%p pos=%Ld", cell, *_pos); - /* lock the list against modification */ - down_read(&cell->vl_sem); - return seq_list_start_head(&cell->vl_list, *_pos); + read_lock(&cell->proc_lock); + return seq_list_start_head(&cell->proc_volumes, *_pos); } /* @@ -421,7 +431,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, struct afs_cell *cell = p->private; _enter("cell=%p pos=%Ld", cell, *_pos); - return seq_list_next(v, &cell->vl_list, _pos); + return seq_list_next(v, &cell->proc_volumes, _pos); } /* @@ -431,17 +441,13 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) { struct afs_cell *cell = p->private; - up_read(&cell->vl_sem); + read_unlock(&cell->proc_lock); } -static const char afs_vlocation_states[][4] = { - [AFS_VL_NEW] = "New", - [AFS_VL_CREATING] = "Crt", - [AFS_VL_VALID] = "Val", - [AFS_VL_NO_VOLUME] = "NoV", - [AFS_VL_UPDATING] = "Upd", - [AFS_VL_VOLUME_DELETED] = "Del", - [AFS_VL_UNCERTAIN] = "Unc", +static const char afs_vol_types[3][3] = { + [AFSVL_RWVOL] = "RW", + [AFSVL_ROVOL] = "RO", + [AFSVL_BACKVOL] = "BK", }; /* @@ -450,23 +456,17 @@ static const char afs_vlocation_states[][4] = { static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { struct afs_cell *cell = m->private; - struct afs_vlocation *vlocation = - list_entry(v, struct afs_vlocation, link); + struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link); - /* display header on line 1 */ - if (v == &cell->vl_list) { - seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); + /* Display header on line 1 */ + if (v == &cell->proc_volumes) { + seq_puts(m, "USE VID TY\n"); return 0; } - /* display one cell per line on subsequent lines */ - seq_printf(m, "%3d %s %08x %08x %08x %s\n", - atomic_read(&vlocation->usage), - 
afs_vlocation_states[vlocation->state], - vlocation->vldb.vid[0], - vlocation->vldb.vid[1], - vlocation->vldb.vid[2], - vlocation->vldb.name); + seq_printf(m, "%3d %08x %s\n", + atomic_read(&vol->usage), vol->vid, + afs_vol_types[vol->type]); return 0; } @@ -501,23 +501,23 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) */ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) { + struct afs_addr_list *alist; struct afs_cell *cell = m->private; loff_t pos = *_pos; - _enter("cell=%p pos=%Ld", cell, *_pos); + rcu_read_lock(); - /* lock the list against modification */ - down_read(&cell->vl_sem); + alist = rcu_dereference(cell->vl_addrs); /* allow for the header line */ if (!pos) return (void *) 1; pos--; - if (pos >= cell->vl_naddrs) + if (!alist || pos >= alist->nr_addrs) return NULL; - return &cell->vl_addrs[pos]; + return alist->addrs + pos; } /* @@ -526,17 +526,18 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *_pos) { + struct afs_addr_list *alist; struct afs_cell *cell = p->private; loff_t pos; - _enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos); + alist = rcu_dereference(cell->vl_addrs); pos = *_pos; (*_pos)++; - if (pos >= cell->vl_naddrs) + if (!alist || pos >= alist->nr_addrs) return NULL; - return &cell->vl_addrs[pos]; + return alist->addrs + pos; } /* @@ -544,9 +545,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, */ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) { - struct afs_cell *cell = p->private; - - up_read(&cell->vl_sem); + rcu_read_unlock(); } /* @@ -554,100 +553,76 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) */ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) { - struct in_addr *addr = v; + struct sockaddr_rxrpc *addr = v; /* display header on line 1 */ - if (v == (struct in_addr *) 1) { + if (v == (void *)1) { seq_puts(m, "ADDRESS\n"); return 0; } /* display one cell per line on subsequent lines */ - seq_printf(m, "%pI4\n", &addr->s_addr); + seq_printf(m, "%pISp\n", &addr->transport); return 0; } /* - * open "/proc/fs/afs/<cell>/servers" which provides a summary of active + * open "/proc/fs/afs/servers" which provides a summary of active * servers */ -static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) +static int afs_proc_servers_open(struct inode *inode, struct file *file) { - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_servers_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = cell; - return 0; + return seq_open(file, &afs_proc_servers_ops); } /* - * set up the iterator to start reading from the cells list and return the - * first item + * Set up the iterator to start reading from the server list and return the + * first item. 
*/ -static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) - __acquires(m->private->servers_lock) +static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos) { - struct afs_cell *cell = m->private; - - _enter("cell=%p pos=%Ld", cell, *_pos); + struct afs_net *net = afs_seq2net(m); - /* lock the list against modification */ - read_lock(&cell->servers_lock); - return seq_list_start_head(&cell->servers, *_pos); + rcu_read_lock(); + return seq_hlist_start_head_rcu(&net->fs_proc, *_pos); } /* * move to next cell in cells list */ -static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, - loff_t *_pos) +static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = p->private; + struct afs_net *net = afs_seq2net(m); - _enter("cell=%p pos=%Ld", cell, *_pos); - return seq_list_next(v, &cell->servers, _pos); + return seq_hlist_next_rcu(v, &net->fs_proc, _pos); } /* * clean up after reading from the cells list */ -static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) - __releases(p->private->servers_lock) +static void afs_proc_servers_stop(struct seq_file *p, void *v) { - struct afs_cell *cell = p->private; - - read_unlock(&cell->servers_lock); + rcu_read_unlock(); } /* * display a header line followed by a load of volume lines */ -static int afs_proc_cell_servers_show(struct seq_file *m, void *v) +static int afs_proc_servers_show(struct seq_file *m, void *v) { - struct afs_cell *cell = m->private; - struct afs_server *server = list_entry(v, struct afs_server, link); - char ipaddr[20]; + struct afs_server *server; + struct afs_addr_list *alist; - /* display header on line 1 */ - if (v == &cell->servers) { - seq_puts(m, "USE ADDR STATE\n"); + if (v == SEQ_START_TOKEN) { + seq_puts(m, "UUID USE ADDR\n"); return 0; } - /* display one cell per line on subsequent lines */ - sprintf(ipaddr, "%pI4", &server->addr); - seq_printf(m, "%3d %-15.15s %5d\n", - atomic_read(&server->usage), ipaddr, server->fs_state); - + server = list_entry(v, struct afs_server, proc_link); + alist = rcu_dereference(server->addresses); + seq_printf(m, "%pU %3d %pISp\n", + &server->uuid, + atomic_read(&server->usage), + &alist->addrs[alist->index].transport); return 0; } diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c new file mode 100644 index 000000000000..d04511fb3879 --- /dev/null +++ b/fs/afs/rotate.c @@ -0,0 +1,757 @@ +/* Handle fileserver selection and rotation. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/sched/signal.h> +#include "internal.h" +#include "afs_fs.h" + +/* + * Initialise a filesystem server cursor for iterating over FS servers. + */ +void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +{ + memset(fc, 0, sizeof(*fc)); +} + +/* + * Begin an operation on the fileserver. + * + * Fileserver operations are serialised on the server by vnode, so we serialise + * them here also using the io_lock. 
+ */ +bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + struct key *key) +{ + afs_init_fs_cursor(fc, vnode); + fc->vnode = vnode; + fc->key = key; + fc->ac.error = SHRT_MAX; + + if (mutex_lock_interruptible(&vnode->io_lock) < 0) { + fc->ac.error = -EINTR; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + fc->flags |= AFS_FS_CURSOR_CUR_ONLY; + return true; +} + +/* + * Begin iteration through a server list, starting with the vnode's last used + * server if possible, or the last recorded good server if not. + */ +static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, + struct afs_vnode *vnode) +{ + struct afs_cb_interest *cbi; + int i; + + read_lock(&vnode->volume->servers_lock); + fc->server_list = afs_get_serverlist(vnode->volume->servers); + read_unlock(&vnode->volume->servers_lock); + + cbi = vnode->cb_interest; + if (cbi) { + /* See if the vnode's preferred record is still available */ + for (i = 0; i < fc->server_list->nr_servers; i++) { + if (fc->server_list->servers[i].cb_interest == cbi) { + fc->start = i; + goto found_interest; + } + } + + /* If we have a lock outstanding on a server that's no longer + * serving this vnode, then we can't switch to another server + * and have to return an error. + */ + if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { + fc->ac.error = -ESTALE; + return false; + } + + /* Note that the callback promise is effectively broken */ + write_seqlock(&vnode->cb_lock); + ASSERTCMP(cbi, ==, vnode->cb_interest); + vnode->cb_interest = NULL; + if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + vnode->cb_break++; + write_sequnlock(&vnode->cb_lock); + + afs_put_cb_interest(afs_v2net(vnode), cbi); + cbi = NULL; + } else { + fc->start = READ_ONCE(fc->server_list->index); + } + +found_interest: + fc->index = fc->start; + return true; +} + +/* + * Post volume busy note. + */ +static void afs_busy(struct afs_volume *volume, u32 abort_code) +{ + const char *m; + + switch (abort_code) { + case VOFFLINE: m = "offline"; break; + case VRESTARTING: m = "restarting"; break; + case VSALVAGING: m = "being salvaged"; break; + default: m = "busy"; break; + } + + pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); +} + +/* + * Sleep and retry the operation to the same fileserver. + */ +static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) +{ + msleep_interruptible(1000); + if (signal_pending(current)) { + fc->ac.error = -ERESTARTSYS; + return false; + } + + return true; +} + +/* + * Select the fileserver to use. May be called multiple times to rotate + * through the fileservers. + */ +bool afs_select_fileserver(struct afs_fs_cursor *fc) +{ + struct afs_addr_list *alist; + struct afs_server *server; + struct afs_vnode *vnode = fc->vnode; + + _enter("%u/%u,%u/%u,%d,%d", + fc->index, fc->start, + fc->ac.index, fc->ac.start, + fc->ac.error, fc->ac.abort_code); + + if (fc->flags & AFS_FS_CURSOR_STOP) { + _leave(" = f [stopped]"); + return false; + } + + /* Evaluate the result of the previous operation, if there was one. */ + switch (fc->ac.error) { + case SHRT_MAX: + goto start; + + case 0: + default: + /* Success or local failure. Stop. */ + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [okay/local %d]", fc->ac.error); + return false; + + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. 
+ */ + switch (fc->ac.abort_code) { + case VNOVOL: + /* This fileserver doesn't know about the volume. + * - May indicate that the VL is wrong - retry once and compare + * the results. + * - May indicate that the fileserver couldn't attach to the vol. + */ + if (fc->flags & AFS_FS_CURSOR_VNOVOL) { + fc->ac.error = -EREMOTEIO; + goto failed; + } + + write_lock(&vnode->volume->servers_lock); + fc->server_list->vnovol_mask |= 1 << fc->index; + write_unlock(&vnode->volume->servers_lock); + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); + fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); + if (fc->ac.error < 0) + goto failed; + + if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { + fc->ac.error = -ENOMEDIUM; + goto failed; + } + + /* If the server list didn't change, then assume that + * it's the fileserver having trouble. + */ + if (vnode->volume->servers == fc->server_list) { + fc->ac.error = -EREMOTEIO; + goto failed; + } + + /* Try again */ + fc->flags |= AFS_FS_CURSOR_VNOVOL; + _leave(" = t [vnovol]"); + return true; + + case VSALVAGE: /* TODO: Should this return an error or iterate? */ + case VVOLEXISTS: + case VNOSERVICE: + case VONLINE: + case VDISKFULL: + case VOVERQUOTA: + fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + goto next_server; + + case VOFFLINE: + if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { + afs_busy(vnode->volume, fc->ac.abort_code); + clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); + } + if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { + fc->ac.error = -EADV; + goto failed; + } + if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { + fc->ac.error = -ESTALE; + goto failed; + } + goto busy; + + case VSALVAGING: + case VRESTARTING: + case VBUSY: + /* Retry after going round all the servers unless we + * have a file lock we need to maintain. + */ + if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { + fc->ac.error = -EBUSY; + goto failed; + } + if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { + afs_busy(vnode->volume, fc->ac.abort_code); + clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); + } + busy: + if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { + if (!afs_sleep_and_retry(fc)) + goto failed; + + /* Retry with same server & address */ + _leave(" = t [vbusy]"); + return true; + } + + fc->flags |= AFS_FS_CURSOR_VBUSY; + goto next_server; + + case VMOVED: + /* The volume migrated to another server. We consider + * all locks and callbacks broken and request + * an update from the VLDB. + * + * We also limit the number of VMOVED hops we will + * honour, just in case someone sets up a loop. + */ + if (fc->flags & AFS_FS_CURSOR_VMOVED) { + fc->ac.error = -EREMOTEIO; + goto failed; + } + fc->flags |= AFS_FS_CURSOR_VMOVED; + + set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); + set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); + fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); + if (fc->ac.error < 0) + goto failed; + + /* If the server list didn't change, then the VLDB is + * out of sync with the fileservers. This is hopefully + * a temporary condition, however, so we don't want to + * permanently block access to the file. + * + * TODO: Try other fileservers if we can. + * + * TODO: Retry a few times with sleeps.
+ */ + if (vnode->volume->servers == fc->server_list) { + fc->ac.error = -ENOMEDIUM; + goto failed; + } + + goto restart_from_beginning; + + default: + clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); + clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); + fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + goto failed; + } + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn"); + goto iterate_address; + } + +restart_from_beginning: + _debug("restart"); + afs_end_cursor(&fc->ac); + afs_put_cb_interest(afs_v2net(vnode), fc->cbi); + fc->cbi = NULL; + afs_put_serverlist(afs_v2net(vnode), fc->server_list); + fc->server_list = NULL; +start: + _debug("start"); + /* See if we need to do an update of the volume record. Note that the + * volume may have moved or even have been deleted. + */ + fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); + if (fc->ac.error < 0) + goto failed; + + if (!afs_start_fs_iteration(fc, vnode)) + goto failed; + goto use_server; + +next_server: + _debug("next"); + afs_put_cb_interest(afs_v2net(vnode), fc->cbi); + fc->cbi = NULL; + fc->index++; + if (fc->index >= fc->server_list->nr_servers) + fc->index = 0; + if (fc->index != fc->start) + goto use_server; + + /* That's all the servers poked to no good effect. Try again if some + * of them were busy. + */ + if (fc->flags & AFS_FS_CURSOR_VBUSY) + goto restart_from_beginning; + + fc->ac.error = -EDESTADDRREQ; + goto failed; + +use_server: + _debug("use"); + /* We're starting on a different fileserver from the list. We need to + * check it, create a callback intercept, find its address list and + * probe its capabilities before we use it. + */ + ASSERTCMP(fc->ac.alist, ==, NULL); + server = fc->server_list->servers[fc->index].server; + + if (!afs_check_server_record(fc, server)) + goto failed; + + _debug("USING SERVER: %pU", &server->uuid); + + /* Make sure we've got a callback interest record for this server. We + * have to link it in before we send the request as we can be sent a + * break request before we've finished decoding the reply and + * installing the vnode. + */ + fc->ac.error = afs_register_server_cb_interest( + vnode, &fc->server_list->servers[fc->index]); + if (fc->ac.error < 0) + goto failed; + + fc->cbi = afs_get_cb_interest(vnode->cb_interest); + + read_lock(&server->fs_lock); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->fs_lock)); + afs_get_addrlist(alist); + read_unlock(&server->fs_lock); + + /* Probe the current fileserver if we haven't done so yet. */ + if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { + fc->ac.alist = afs_get_addrlist(alist); + + if (!afs_probe_fileserver(fc)) + goto failed; + } + + if (!fc->ac.alist) + fc->ac.alist = alist; + else + afs_put_addrlist(alist); + + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + goto iterate_address; + +iterate_address: + ASSERT(fc->ac.alist); + _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs); + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us.
+ */ + if (afs_iterate_addresses(&fc->ac)) { + _leave(" = t"); + return true; + } + + afs_end_cursor(&fc->ac); + goto next_server; + +failed: + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [failed %d]", fc->ac.error); + return false; +} + +/* + * Select the same fileserver we used for a vnode before and only that + * fileserver. We use this when we have a lock on that file, which is backed + * only by the fileserver we obtained it from. + */ +bool afs_select_current_fileserver(struct afs_fs_cursor *fc) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_cb_interest *cbi = vnode->cb_interest; + struct afs_addr_list *alist; + + _enter(""); + + switch (fc->ac.error) { + case SHRT_MAX: + if (!cbi) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->cbi = afs_get_cb_interest(vnode->cb_interest); + + read_lock(&cbi->server->fs_lock); + alist = rcu_dereference_protected(cbi->server->addresses, + lockdep_is_held(&cbi->server->fs_lock)); + afs_get_addrlist(alist); + read_unlock(&cbi->server->fs_lock); + if (!alist) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->ac.alist = alist; + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + goto iterate_address; + + case 0: + default: + /* Success or local failure. Stop. */ + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [okay/local %d]", fc->ac.error); + return false; + + case -ECONNABORTED: + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [abort]"); + return false; + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn"); + goto iterate_address; + } + +iterate_address: + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + if (afs_iterate_addresses(&fc->ac)) { + _leave(" = t"); + return true; + } + + afs_end_cursor(&fc->ac); + return false; +} + +/* + * Tidy up a filesystem cursor and unlock the vnode. + */ +int afs_end_vnode_operation(struct afs_fs_cursor *fc) +{ + struct afs_net *net = afs_v2net(fc->vnode); + int ret; + + mutex_unlock(&fc->vnode->io_lock); + + afs_end_cursor(&fc->ac); + afs_put_cb_interest(net, fc->cbi); + afs_put_serverlist(net, fc->server_list); + + ret = fc->ac.error; + if (ret == -ECONNABORTED) + ret = afs_abort_to_error(fc->ac.abort_code); + + return ret; +} + +#if 0 +/* + * Set a filesystem server cursor for using a specific FS server.
+ */ +int afs_set_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +{ + afs_init_fs_cursor(fc, vnode); + + read_seqlock_excl(&vnode->cb_lock); + if (vnode->cb_interest) { + if (vnode->cb_interest->server->fs_state == 0) + fc->server = afs_get_server(vnode->cb_interest->server); + else + fc->ac.error = vnode->cb_interest->server->fs_state; + } else { + fc->ac.error = -ESTALE; + } + read_sequnlock_excl(&vnode->cb_lock); + + return fc->ac.error; +} + +/* + * pick a server to use to try accessing this volume + * - returns with an elevated usage count on the server chosen + */ +bool afs_volume_pick_fileserver(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +{ + struct afs_volume *volume = vnode->volume; + struct afs_server *server; + int ret, state, loop; + + _enter("%s", volume->vlocation->vldb.name); + + /* stick with the server we're already using if we can */ + if (vnode->cb_interest && vnode->cb_interest->server->fs_state == 0) { + fc->server = afs_get_server(vnode->cb_interest->server); + goto set_server; + } + + down_read(&volume->server_sem); + + /* handle the no-server case */ + if (volume->nservers == 0) { + fc->ac.error = volume->rjservers ? -ENOMEDIUM : -ESTALE; + up_read(&volume->server_sem); + _leave(" = f [no servers %d]", fc->ac.error); + return false; + } + + /* basically, just search the list for the first live server and use + * that */ + ret = 0; + for (loop = 0; loop < volume->nservers; loop++) { + server = volume->servers[loop]; + state = server->fs_state; + + _debug("consider %d [%d]", loop, state); + + switch (state) { + case 0: + goto picked_server; + + case -ENETUNREACH: + if (ret == 0) + ret = state; + break; + + case -EHOSTUNREACH: + if (ret == 0 || + ret == -ENETUNREACH) + ret = state; + break; + + case -ECONNREFUSED: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH) + ret = state; + break; + + default: + case -EREMOTEIO: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) + ret = state; + break; + } + } + +error: + fc->ac.error = ret; + + /* no available servers + * - TODO: handle the no active servers case better + */ + up_read(&volume->server_sem); + _leave(" = f [%d]", fc->ac.error); + return false; + +picked_server: + /* Found an apparently healthy server. We need to register an interest + * in receiving callbacks before we talk to it. 
+ */ + ret = afs_register_server_cb_interest(vnode, + &volume->cb_interests[loop], server); + if (ret < 0) + goto error; + + fc->server = afs_get_server(server); + up_read(&volume->server_sem); +set_server: + fc->ac.alist = afs_get_addrlist(fc->server->addrs); + fc->ac.addr = &fc->ac.alist->addrs[0]; + _debug("USING SERVER: %pIS\n", &fc->ac.addr->transport); + _leave(" = t (picked %pIS)", &fc->ac.addr->transport); + return true; +} + +/* + * release a server after use + * - releases the ref on the server struct that was acquired by picking + * - records result of using a particular server to access a volume + * - return true to try again, false if okay or to issue error + * - the caller must release the server struct if result was false + */ +bool afs_iterate_fs_cursor(struct afs_fs_cursor *fc, + struct afs_vnode *vnode) +{ + struct afs_volume *volume = vnode->volume; + struct afs_server *server = fc->server; + unsigned loop; + + _enter("%s,%pIS,%d", + volume->vlocation->vldb.name, &fc->ac.addr->transport, + fc->ac.error); + + switch (fc->ac.error) { + /* success */ + case 0: + server->fs_state = 0; + _leave(" = f"); + return false; + + /* the fileserver denied all knowledge of the volume */ + case -ENOMEDIUM: + down_write(&volume->server_sem); + + /* firstly, find where the server is in the active list (if it + * is) */ + for (loop = 0; loop < volume->nservers; loop++) + if (volume->servers[loop] == server) + goto present; + + /* no longer there - may have been discarded by another op */ + goto try_next_server_upw; + + present: + volume->nservers--; + memmove(&volume->servers[loop], + &volume->servers[loop + 1], + sizeof(volume->servers[loop]) * + (volume->nservers - loop)); + volume->servers[volume->nservers] = NULL; + afs_put_server(afs_v2net(vnode), server); + volume->rjservers++; + + if (volume->nservers > 0) + /* another server might acknowledge its existence */ + goto try_next_server_upw; + + /* handle the case where all the fileservers have rejected the + * volume + * - TODO: try asking the fileservers for volume information + * - TODO: contact the VL server again to see if the volume is + * no longer registered + */ + up_write(&volume->server_sem); + afs_put_server(afs_v2net(vnode), server); + fc->server = NULL; + _leave(" = f [completely rejected]"); + return false; + + /* problem reaching the server */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIME: + case -ETIMEDOUT: + case -EREMOTEIO: + /* mark the server as dead + * TODO: vary dead timeout depending on error + */ + spin_lock(&server->fs_lock); + if (!server->fs_state) { + server->fs_state = fc->ac.error; + printk("kAFS: SERVER DEAD state=%d\n", fc->ac.error); + } + spin_unlock(&server->fs_lock); + goto try_next_server; + + /* miscellaneous error */ + default: + case -ENOMEM: + case -ENONET: + /* tell the caller to accept the result */ + afs_put_server(afs_v2net(vnode), server); + fc->server = NULL; + _leave(" = f [local failure]"); + return false; + } + + /* tell the caller to loop around and try the next server */ +try_next_server_upw: + up_write(&volume->server_sem); +try_next_server: + afs_put_server(afs_v2net(vnode), server); + _leave(" = t [try next server]"); + return true; +} + +/* + * Clean up a fileserver cursor. 
+ */ +int afs_end_fs_cursor(struct afs_fs_cursor *fc, struct afs_net *net) +{ + afs_end_cursor(&fc->ac); + afs_put_server(net, fc->server); + return fc->ac.error; +} + +#endif diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 0bf191f0dbaf..ea1460b9b71a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -17,13 +17,10 @@ #include "internal.h" #include "afs_cm.h" -struct socket *afs_socket; /* my RxRPC socket */ -static struct workqueue_struct *afs_async_calls; -static struct afs_call *afs_spare_incoming_call; -atomic_t afs_outstanding_calls; +struct workqueue_struct *afs_async_calls; static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); -static int afs_wait_for_call_to_complete(struct afs_call *); +static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); @@ -34,24 +31,13 @@ static int afs_deliver_cm_op_id(struct afs_call *); static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", .deliver = afs_deliver_cm_op_id, - .abort_to_error = afs_abort_to_error, }; -static void afs_charge_preallocation(struct work_struct *); - -static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); - -static int afs_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} - /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT */ -int afs_open_socket(void) +int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; @@ -59,28 +45,26 @@ int afs_open_socket(void) _enter(""); - ret = -ENOMEM; - afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); - if (!afs_async_calls) - goto error_0; - - ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket); if (ret < 0) goto error_1; socket->sk->sk_allocation = GFP_NOFS; /* bind the callback manager's address to make this a server socket */ + memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; srx.srx_service = CM_SERVICE; srx.transport_type = SOCK_DGRAM; - srx.transport_len = sizeof(srx.transport.sin); - srx.transport.sin.sin_family = AF_INET; - srx.transport.sin.sin_port = htons(AFS_CM_PORT); - memset(&srx.transport.sin.sin_addr, 0, - sizeof(srx.transport.sin.sin_addr)); + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + if (ret == -EADDRINUSE) { + srx.transport.sin6.sin6_port = 0; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + } if (ret < 0) goto error_2; @@ -91,16 +75,14 @@ int afs_open_socket(void) if (ret < 0) goto error_2; - afs_socket = socket; - afs_charge_preallocation(NULL); + net->socket = socket; + afs_charge_preallocation(&net->charge_preallocation_work); _leave(" = 0"); return 0; error_2: sock_release(socket); error_1: - destroy_workqueue(afs_async_calls); -error_0: _leave(" = %d", ret); return ret; } @@ -108,36 +90,36 @@ error_0: /* * close the RxRPC socket AFS was using */ -void afs_close_socket(void) +void afs_close_socket(struct afs_net *net) { _enter(""); - kernel_listen(afs_socket, 0); + 
kernel_listen(net->socket, 0); flush_workqueue(afs_async_calls); - if (afs_spare_incoming_call) { - afs_put_call(afs_spare_incoming_call); - afs_spare_incoming_call = NULL; + if (net->spare_incoming_call) { + afs_put_call(net->spare_incoming_call); + net->spare_incoming_call = NULL; } - _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); - wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, + _debug("outstanding %u", atomic_read(&net->nr_outstanding_calls)); + wait_on_atomic_t(&net->nr_outstanding_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); - kernel_sock_shutdown(afs_socket, SHUT_RDWR); + kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); - sock_release(afs_socket); + sock_release(net->socket); _debug("dework"); - destroy_workqueue(afs_async_calls); _leave(""); } /* * Allocate a call. */ -static struct afs_call *afs_alloc_call(const struct afs_call_type *type, +static struct afs_call *afs_alloc_call(struct afs_net *net, + const struct afs_call_type *type, gfp_t gfp) { struct afs_call *call; @@ -148,11 +130,13 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type, return NULL; call->type = type; + call->net = net; atomic_set(&call->usage, 1); INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); + spin_lock_init(&call->state_lock); - o = atomic_inc_return(&afs_outstanding_calls); + o = atomic_inc_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_alloc, 1, o, __builtin_return_address(0)); return call; @@ -163,8 +147,9 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type, */ void afs_put_call(struct afs_call *call) { + struct afs_net *net = call->net; int n = atomic_dec_return(&call->usage); - int o = atomic_read(&afs_outstanding_calls); + int o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_put, n + 1, o, __builtin_return_address(0)); @@ -175,20 +160,22 @@ void afs_put_call(struct afs_call *call) ASSERT(call->type->name != NULL); if (call->rxcall) { - rxrpc_kernel_end_call(afs_socket, call->rxcall); + rxrpc_kernel_end_call(net->socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) call->type->destructor(call); + afs_put_server(call->net, call->cm_server); + afs_put_cb_interest(call->net, call->cbi); kfree(call->request); kfree(call); - o = atomic_dec_return(&afs_outstanding_calls); + o = atomic_dec_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_free, 0, o, __builtin_return_address(0)); if (o == 0) - wake_up_atomic_t(&afs_outstanding_calls); + wake_up_atomic_t(&net->nr_outstanding_calls); } } @@ -200,7 +187,7 @@ int afs_queue_call_work(struct afs_call *call) int u = atomic_inc_return(&call->usage); trace_afs_call(call, afs_call_trace_work, u, - atomic_read(&afs_outstanding_calls), + atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); INIT_WORK(&call->work, call->type->work); @@ -213,12 +200,13 @@ int afs_queue_call_work(struct afs_call *call) /* * allocate a call with flat request and reply buffers */ -struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, +struct afs_call *afs_alloc_flat_call(struct afs_net *net, + const struct afs_call_type *type, size_t request_size, size_t reply_max) { struct afs_call *call; - call = afs_alloc_call(type, GFP_NOFS); + call = afs_alloc_call(net, type, GFP_NOFS); if (!call) goto nomem_call; @@ -236,6 +224,7 @@ struct afs_call *afs_alloc_flat_call(const struct 
afs_call_type *type, goto nomem_free; } + call->operation_ID = type->op; init_waitqueue_head(&call->waitq); return call; @@ -300,8 +289,7 @@ static void afs_notify_end_request_tx(struct sock *sock, { struct afs_call *call = (struct afs_call *)call_user_ID; - if (call->state == AFS_CALL_REQUESTING) - call->state = AFS_CALL_AWAIT_REPLY; + afs_set_call_state(call, AFS_CALL_CL_REQUESTING, AFS_CALL_CL_AWAIT_REPLY); } /* @@ -319,11 +307,13 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) do { afs_load_bvec(call, msg, bv, first, last, offset); + trace_afs_send_pages(call, msg, first, last, offset); + offset = 0; bytes = msg->msg_iter.count; nr = msg->msg_iter.nr_segs; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, + ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, msg, bytes, afs_notify_end_request_tx); for (loop = 0; loop < nr; loop++) put_page(bv[loop].bv_page); @@ -333,62 +323,62 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) first += nr; } while (first <= last); + trace_afs_sent_pages(call, call->first, last, first, ret); return ret; } /* * initiate a call */ -int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, - bool async) +long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, + gfp_t gfp, bool async) { - struct sockaddr_rxrpc srx; + struct sockaddr_rxrpc *srx = ac->addr; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; size_t offset; s64 tx_total_len; - u32 abort_code; int ret; - _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); + _enter(",{%pISp},", &srx->transport); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); _debug("____MAKE %p{%s,%x} [%d]____", call, call->type->name, key_serial(call->key), - atomic_read(&afs_outstanding_calls)); + atomic_read(&call->net->nr_outstanding_calls)); call->async = async; - memset(&srx, 0, sizeof(srx)); - srx.srx_family = AF_RXRPC; - srx.srx_service = call->service_id; - srx.transport_type = SOCK_DGRAM; - srx.transport_len = sizeof(srx.transport.sin); - srx.transport.sin.sin_family = AF_INET; - srx.transport.sin.sin_port = call->port; - memcpy(&srx.transport.sin.sin_addr, addr, 4); - /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data * after the initial fixed part. */ tx_total_len = call->request_size; if (call->send_pages) { - tx_total_len += call->last_to - call->first_offset; - tx_total_len += (call->last - call->first) * PAGE_SIZE; + if (call->last == call->first) { + tx_total_len += call->last_to - call->first_offset; + } else { + /* It looks mathematically like you should be able to + * combine the following lines with the ones above, but + * unsigned arithmetic is fun when it wraps... + */ + tx_total_len += PAGE_SIZE - call->first_offset; + tx_total_len += call->last_to; + tx_total_len += (call->last - call->first - 1) * PAGE_SIZE; + } } /* create a call */ - rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, + rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, (unsigned long)call, tx_total_len, gfp, (async ? 
afs_wake_up_async_call : - afs_wake_up_call_waiter)); - call->key = NULL; + afs_wake_up_call_waiter), + call->upgrade); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); goto error_kill_call; @@ -406,16 +396,9 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = (call->send_pages ? MSG_MORE : 0); + msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0); - /* We have to change the state *before* sending the last packet as - * rxrpc might give us the reply before it returns from sending the - * request. Further, if the send fails, we may already have been given - * a notification and may have collected it. - */ - if (!call->send_pages) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, rxcall, + ret = rxrpc_kernel_send_data(call->net->socket, rxcall, &msg, call->request_size, afs_notify_end_request_tx); if (ret < 0) @@ -432,22 +415,26 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, if (call->async) return -EINPROGRESS; - return afs_wait_for_call_to_complete(call); + return afs_wait_for_call_to_complete(call, ac); error_do_abort: call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { - rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, - ret, "KSD"); + rxrpc_kernel_abort_call(call->net->socket, rxcall, + RX_USER_ABORT, ret, "KSD"); } else { - abort_code = 0; offset = 0; - rxrpc_kernel_recv_data(afs_socket, rxcall, NULL, 0, &offset, - false, &abort_code); - ret = call->type->abort_to_error(abort_code); + rxrpc_kernel_recv_data(call->net->socket, rxcall, NULL, + 0, &offset, false, &call->abort_code, + &call->service_id); + ac->abort_code = call->abort_code; + ac->responded = true; } + call->error = ret; + trace_afs_call_done(call); error_kill_call: afs_put_call(call); + ac->error = ret; _leave(" = %d", ret); return ret; } @@ -457,123 +444,174 @@ error_kill_call: */ static void afs_deliver_to_call(struct afs_call *call) { - u32 abort_code; + enum afs_call_state state; + u32 abort_code, remote_abort = 0; int ret; _enter("%s", call->type->name); - while (call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK + while (state = READ_ONCE(call->state), + state == AFS_CALL_CL_AWAIT_REPLY || + state == AFS_CALL_SV_AWAIT_OP_ID || + state == AFS_CALL_SV_AWAIT_REQUEST || + state == AFS_CALL_SV_AWAIT_ACK ) { - if (call->state == AFS_CALL_AWAIT_ACK) { + if (state == AFS_CALL_SV_AWAIT_ACK) { size_t offset = 0; - ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + ret = rxrpc_kernel_recv_data(call->net->socket, + call->rxcall, NULL, 0, &offset, false, - &call->abort_code); + &remote_abort, + &call->service_id); trace_afs_recv_data(call, 0, offset, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; - if (ret == 1 || ret < 0) { - call->state = AFS_CALL_COMPLETE; - goto done; + if (ret < 0 || ret == 1) { + if (ret == 1) + ret = 0; + goto call_complete; } return; } ret = call->type->deliver(call); + state = READ_ONCE(call->state); switch (ret) { case 0: - if (call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; + if (state == AFS_CALL_CL_PROC_REPLY) + goto call_complete; + ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); goto done; case -EINPROGRESS: case -EAGAIN: goto out; + case -EIO: case -ECONNABORTED: - goto call_complete; + ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + goto done; case 
-ENOTCONN: abort_code = RX_CALL_DEAD; - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KNC"); - goto save_error; + goto local_abort; case -ENOTSUPP: abort_code = RXGEN_OPCODE; - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KIV"); - goto save_error; + goto local_abort; case -ENODATA: case -EBADMSG: case -EMSGSIZE: default: abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) + if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, -EBADMSG, "KUM"); - goto save_error; + goto local_abort; } } done: - if (call->state == AFS_CALL_COMPLETE && call->incoming) + if (state == AFS_CALL_COMPLETE && call->incoming) afs_put_call(call); out: _leave(""); return; -save_error: - call->error = ret; +local_abort: + abort_code = 0; call_complete: - call->state = AFS_CALL_COMPLETE; + afs_set_call_complete(call, ret, remote_abort); + state = AFS_CALL_COMPLETE; goto done; } /* * wait synchronously for a call to complete */ -static int afs_wait_for_call_to_complete(struct afs_call *call) +static long afs_wait_for_call_to_complete(struct afs_call *call, + struct afs_addr_cursor *ac) { - int ret; + signed long rtt2, timeout; + long ret; + u64 rtt; + u32 life, last_life; DECLARE_WAITQUEUE(myself, current); _enter(""); + rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); + rtt2 = nsecs_to_jiffies64(rtt) * 2; + if (rtt2 < 2) + rtt2 = 2; + + timeout = rtt2; + last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + add_wait_queue(&call->waitq, &myself); for (;;) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); /* deliver any messages that are in the queue */ - if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && + call->need_attention) { call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } - if (call->state == AFS_CALL_COMPLETE || - signal_pending(current)) + if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; - schedule(); + + life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + if (timeout == 0 && + life == last_life && signal_pending(current)) + break; + + if (life != last_life) { + timeout = rtt2; + last_life = life; + } + + timeout = schedule_timeout(timeout); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); /* Kill off the call if it's still live. 
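The rewritten wait loop above deserves a note: it now sleeps uninterruptibly in slices of roughly twice the measured RTT, re-arms that budget whenever rxrpc reports the call's life counter advancing, and only lets a pending signal kill the call once the counter has stalled for a whole slice. A minimal userspace simulation of that policy, assuming nothing about the kernel API (every *_sim name is invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for rxrpc_kernel_check_life(): the counter advances while
 * the peer makes progress, then stalls at tick 5 in this simulation. */
static unsigned int check_life_sim(int tick)
{
        return tick < 5 ? (unsigned int)tick : 5;
}

/* Stand-in for signal_pending(): the user interrupts at tick 3. */
static bool signal_pending_sim(int tick)
{
        return tick >= 3;
}

int main(void)
{
        const int rtt2 = 2;     /* ~2 * RTT in jiffies, clamped to >= 2 */
        int timeout = rtt2;
        unsigned int life, last_life = check_life_sim(0);

        for (int tick = 1; tick < 20; tick++) {
                life = check_life_sim(tick);

                /* Only honour a signal once the budget is spent AND the
                 * call has shown no sign of life for a whole interval. */
                if (timeout == 0 && life == last_life && signal_pending_sim(tick)) {
                        printf("tick %d: call looks dead, aborting\n", tick);
                        return 1;
                }

                if (life != last_life) {        /* progress: re-arm the budget */
                        timeout = rtt2;
                        last_life = life;
                } else if (timeout > 0) {
                        timeout--;              /* schedule_timeout() ticking down */
                }
        }
        printf("call completed\n");
        return 0;
}

In this run the signal is pending from tick 3 but only acted on at tick 8, once two RTTs have passed with no sign of progress.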
*/ - if (call->state < AFS_CALL_COMPLETE) { + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { _debug("call interrupted"); - rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, -EINTR, "KWI"); + if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + RX_USER_ABORT, -EINTR, "KWI")) + afs_set_call_complete(call, -EINTR, 0); + } + + spin_lock_bh(&call->state_lock); + ac->abort_code = call->abort_code; + ac->error = call->error; + spin_unlock_bh(&call->state_lock); + + ret = ac->error; + switch (ret) { + case 0: + if (call->ret_reply0) { + ret = (long)call->reply[0]; + call->reply[0] = NULL; + } + /* Fall through */ + case -ECONNABORTED: + ac->responded = true; + break; } - ret = call->error; _debug("call complete"); afs_put_call(call); - _leave(" = %d", ret); + _leave(" = %p", (void *)ret); return ret; } @@ -604,7 +642,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, u = __atomic_add_unless(&call->usage, 1, 0); if (u != 0) { trace_afs_call(call, afs_call_trace_wake, u, - atomic_read(&afs_outstanding_calls), + atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); if (!queue_work(afs_async_calls, &call->async_work)) @@ -643,7 +681,7 @@ static void afs_process_async_call(struct work_struct *work) } if (call->state == AFS_CALL_COMPLETE) { - call->reply = NULL; + call->reply[0] = NULL; /* We have two refs to release - one from the alloc and one * queued with the work item - and we can't just deallocate the @@ -668,22 +706,24 @@ static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) /* * Charge the incoming call preallocation. */ -static void afs_charge_preallocation(struct work_struct *work) +void afs_charge_preallocation(struct work_struct *work) { - struct afs_call *call = afs_spare_incoming_call; + struct afs_net *net = + container_of(work, struct afs_net, charge_preallocation_work); + struct afs_call *call = net->spare_incoming_call; for (;;) { if (!call) { - call = afs_alloc_call(&afs_RXCMxxxx, GFP_KERNEL); + call = afs_alloc_call(net, &afs_RXCMxxxx, GFP_KERNEL); if (!call) break; call->async = true; - call->state = AFS_CALL_AWAIT_OP_ID; + call->state = AFS_CALL_SV_AWAIT_OP_ID; init_waitqueue_head(&call->waitq); } - if (rxrpc_kernel_charge_accept(afs_socket, + if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, afs_rx_attach, (unsigned long)call, @@ -691,7 +731,7 @@ static void afs_charge_preallocation(struct work_struct *work) break; call = NULL; } - afs_spare_incoming_call = call; + net->spare_incoming_call = call; } /* @@ -712,7 +752,9 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { - queue_work(afs_wq, &afs_charge_preallocation_work); + struct afs_net *net = afs_sock2net(sk); + + queue_work(afs_wq, &net->charge_preallocation_work); } /* @@ -733,7 +775,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call) return ret; call->operation_ID = ntohl(call->tmp); - call->state = AFS_CALL_AWAIT_REQUEST; + afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST); call->offset = 0; /* ask the cache manager to route the call (it'll change the call type @@ -758,8 +800,7 @@ static void afs_notify_end_reply_tx(struct sock *sock, { struct afs_call *call = (struct afs_call *)call_user_ID; - if (call->state == AFS_CALL_REPLYING) - call->state = AFS_CALL_AWAIT_ACK; + afs_set_call_state(call, AFS_CALL_SV_REPLYING, AFS_CALL_SV_AWAIT_ACK); } 
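A recurring change in these hunks is that open-coded checks of the form "if (call->state == X) call->state = Y;" become afs_set_call_state(call, X, Y): the transition happens only if the call is still in the expected source state, so a stale notification cannot regress a call that the receive path has already advanced. A sketch of that guarded transition, using one C11 compare-and-swap where the kernel uses its state lock (the enum values and names here are illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum call_state_sim {
        CL_REQUESTING,          /* request being transmitted */
        CL_AWAIT_REPLY,         /* request sent, awaiting reply */
        CL_COMPLETE,            /* call finished */
};

static bool set_call_state_sim(atomic_int *state, int from, int to)
{
        /* One CAS stands in for "lock; if (state == from) state = to". */
        return atomic_compare_exchange_strong(state, &from, to);
}

int main(void)
{
        atomic_int st = CL_REQUESTING;

        /* The Tx-complete notification fires first: transition succeeds. */
        if (set_call_state_sim(&st, CL_REQUESTING, CL_AWAIT_REPLY))
                puts("REQUESTING -> AWAIT_REPLY");

        /* The receive path then completes the call... */
        set_call_state_sim(&st, CL_AWAIT_REPLY, CL_COMPLETE);

        /* ...so a late duplicate notification must not regress it. */
        if (!set_call_state_sim(&st, CL_REQUESTING, CL_AWAIT_REPLY))
                puts("stale transition refused");
        return 0;
}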
/* @@ -767,11 +808,12 @@ static void afs_notify_end_reply_tx(struct sock *sock, */ void afs_send_empty_reply(struct afs_call *call) { + struct afs_net *net = call->net; struct msghdr msg; _enter(""); - rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0); + rxrpc_kernel_set_tx_length(net->socket, call->rxcall, 0); msg.msg_name = NULL; msg.msg_namelen = 0; @@ -780,8 +822,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_controllen = 0; msg.msg_flags = 0; - call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0, + switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0, afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); @@ -789,7 +830,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(net->socket, call->rxcall, RX_USER_ABORT, -ENOMEM, "KOO"); default: _leave(" [error]"); @@ -802,13 +843,14 @@ void afs_send_empty_reply(struct afs_call *call) */ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) { + struct afs_net *net = call->net; struct msghdr msg; struct kvec iov[1]; int n; _enter(""); - rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len); + rxrpc_kernel_set_tx_length(net->socket, call->rxcall, len); iov[0].iov_base = (void *) buf; iov[0].iov_len = len; @@ -819,8 +861,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_controllen = 0; msg.msg_flags = 0; - call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len, + n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len, afs_notify_end_reply_tx); if (n >= 0) { /* Success */ @@ -830,7 +871,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(net->socket, call->rxcall, RX_USER_ABORT, -ENOMEM, "KOO"); } _leave(" [error]"); @@ -842,6 +883,9 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) int afs_extract_data(struct afs_call *call, void *buf, size_t count, bool want_more) { + struct afs_net *net = call->net; + enum afs_call_state state; + u32 remote_abort; int ret; _enter("{%s,%zu},,%zu,%d", @@ -849,31 +893,32 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, ASSERTCMP(call->offset, <=, count); - ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, buf, count, &call->offset, - want_more, &call->abort_code); + want_more, &remote_abort, + &call->service_id); trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; + state = READ_ONCE(call->state); if (ret == 1) { - switch (call->state) { - case AFS_CALL_AWAIT_REPLY: - call->state = AFS_CALL_COMPLETE; + switch (state) { + case AFS_CALL_CL_AWAIT_REPLY: + afs_set_call_state(call, state, AFS_CALL_CL_PROC_REPLY); break; - case AFS_CALL_AWAIT_REQUEST: - call->state = AFS_CALL_REPLYING; + case AFS_CALL_SV_AWAIT_REQUEST: + afs_set_call_state(call, state, AFS_CALL_SV_REPLYING); break; + case AFS_CALL_COMPLETE: + kdebug("prem complete %d", call->error); + return -EIO; default: break; } return 0; } - if (ret == -ECONNABORTED) - call->error = call->type->abort_to_error(call->abort_code); - else - call->error = ret; - call->state = AFS_CALL_COMPLETE; + afs_set_call_complete(call, ret, remote_abort); 
return ret; } diff --git a/fs/afs/security.c b/fs/afs/security.c index faca66227ecf..b88b7d45fdaa 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -1,6 +1,6 @@ /* AFS security handling * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -14,9 +14,13 @@ #include <linux/fs.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/hashtable.h> #include <keys/rxrpc-type.h> #include "internal.h" +static DEFINE_HASHTABLE(afs_permits_cache, 10); +static DEFINE_SPINLOCK(afs_permits_lock); + /* * get a key */ @@ -46,167 +50,235 @@ struct key *afs_request_key(struct afs_cell *cell) } /* - * dispose of a permits list + * Dispose of a list of permits. */ -void afs_zap_permits(struct rcu_head *rcu) +static void afs_permits_rcu(struct rcu_head *rcu) { struct afs_permits *permits = container_of(rcu, struct afs_permits, rcu); - int loop; - - _enter("{%d}", permits->count); + int i; - for (loop = permits->count - 1; loop >= 0; loop--) - key_put(permits->permits[loop].key); + for (i = 0; i < permits->nr_permits; i++) + key_put(permits->permits[i].key); kfree(permits); } /* - * dispose of a permits list in which all the key pointers have been copied + * Discard a permission cache. */ -static void afs_dispose_of_permits(struct rcu_head *rcu) +void afs_put_permits(struct afs_permits *permits) { - struct afs_permits *permits = - container_of(rcu, struct afs_permits, rcu); - - _enter("{%d}", permits->count); - - kfree(permits); + if (permits && refcount_dec_and_test(&permits->usage)) { + spin_lock(&afs_permits_lock); + hash_del_rcu(&permits->hash_node); + spin_unlock(&afs_permits_lock); + call_rcu(&permits->rcu, afs_permits_rcu); + } } /* - * get the authorising vnode - this is the specified inode itself if it's a - * directory or it's the parent directory if the specified inode is a file or - * symlink - * - the caller must release the ref on the inode + * Clear a permit cache on callback break. */ -static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode, - struct key *key) +void afs_clear_permits(struct afs_vnode *vnode) { - struct afs_vnode *auth_vnode; - struct inode *auth_inode; + struct afs_permits *permits; - _enter(""); + spin_lock(&vnode->lock); + permits = rcu_dereference_protected(vnode->permit_cache, + lockdep_is_held(&vnode->lock)); + RCU_INIT_POINTER(vnode->permit_cache, NULL); + vnode->cb_break++; + spin_unlock(&vnode->lock); - if (S_ISDIR(vnode->vfs_inode.i_mode)) { - auth_inode = igrab(&vnode->vfs_inode); - ASSERT(auth_inode != NULL); - } else { - auth_inode = afs_iget(vnode->vfs_inode.i_sb, key, - &vnode->status.parent, NULL, NULL); - if (IS_ERR(auth_inode)) - return ERR_CAST(auth_inode); - } - - auth_vnode = AFS_FS_I(auth_inode); - _leave(" = {%x}", auth_vnode->fid.vnode); - return auth_vnode; + if (permits) + afs_put_permits(permits); } /* - * clear the permit cache on a directory vnode + * Hash a list of permits. Use simple addition to make it easy to add an extra + * one at an as-yet indeterminate position in the list. 
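A standalone illustration of why plain addition suffices for this hash: the sum is order-independent, so a candidate list can be hashed before the new entry's final sort position is known. The struct below is a simplification for demonstration, not the kernel's struct afs_permit:

#include <stdio.h>

struct permit_sim {
        void *key;              /* stands in for struct key * */
        unsigned int access;    /* afs_access_t bits */
};

static unsigned long hash_permits_sim(const struct permit_sim *p, int n)
{
        unsigned long h = n;

        for (int i = 0; i < n; i++) {
                h += (unsigned long)p[i].key / sizeof(void *);
                h += p[i].access;
        }
        return h;
}

int main(void)
{
        int k1, k2, k3;
        struct permit_sim a[] = { { &k1, 017 }, { &k2, 7 }, { &k3, 1 } };
        struct permit_sim b[] = { { &k3, 1 }, { &k1, 017 }, { &k2, 7 } };

        /* Same members in a different order hash identically, so both
         * would land in the same bucket of a global permits cache. */
        printf("%lu == %lu\n", hash_permits_sim(a, 3), hash_permits_sim(b, 3));
        return 0;
}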
*/ -void afs_clear_permits(struct afs_vnode *vnode) +static void afs_hash_permits(struct afs_permits *permits) { - struct afs_permits *permits; - - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + unsigned long h = permits->nr_permits; + int i; - mutex_lock(&vnode->permits_lock); - permits = vnode->permits; - RCU_INIT_POINTER(vnode->permits, NULL); - mutex_unlock(&vnode->permits_lock); + for (i = 0; i < permits->nr_permits; i++) { + h += (unsigned long)permits->permits[i].key / sizeof(void *); + h += permits->permits[i].access; + } - if (permits) - call_rcu(&permits->rcu, afs_zap_permits); - _leave(""); + permits->h = h; } /* - * add the result obtained for a vnode to its or its parent directory's cache - * for the key used to access it + * Cache the CallerAccess result obtained from doing a fileserver operation + * that returned a vnode status for a particular key. If a callback break + * occurs whilst the operation was in progress then we have to ditch the cache + * as the ACL *may* have changed. */ -void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order) +void afs_cache_permit(struct afs_vnode *vnode, struct key *key, + unsigned int cb_break) { - struct afs_permits *permits, *xpermits; - struct afs_permit *permit; - struct afs_vnode *auth_vnode; - int count, loop; - - _enter("{%x:%u},%x,%lx", - vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order); - - auth_vnode = afs_get_auth_inode(vnode, key); - if (IS_ERR(auth_vnode)) { - _leave(" [get error %ld]", PTR_ERR(auth_vnode)); - return; - } + struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL; + afs_access_t caller_access = READ_ONCE(vnode->status.caller_access); + size_t size = 0; + bool changed = false; + int i, j; + + _enter("{%x:%u},%x,%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access); + + rcu_read_lock(); + + /* Check for the common case first: We got back the same access as last + * time we tried and already have it recorded. + */ + permits = rcu_dereference(vnode->permit_cache); + if (permits) { + if (!permits->invalidated) { + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) + break; + if (permits->permits[i].access != caller_access) { + changed = true; + break; + } + + if (cb_break != (vnode->cb_break + + vnode->cb_interest->server->cb_s_break)) { + changed = true; + break; + } + + /* The cache is still good. */ + rcu_read_unlock(); + return; + } + } - mutex_lock(&auth_vnode->permits_lock); + changed |= permits->invalidated; + size = permits->nr_permits; - /* guard against a rename being detected whilst we waited for the - * lock */ - if (memcmp(&auth_vnode->fid, &vnode->status.parent, - sizeof(struct afs_fid)) != 0) { - _debug("renamed"); - goto out_unlock; + /* If this set of permits is now wrong, clear the permits + * pointer so that no one tries to use the stale information. 
+ */ + if (changed) { + spin_lock(&vnode->lock); + if (permits != rcu_access_pointer(vnode->permit_cache)) + goto someone_else_changed_it_unlock; + RCU_INIT_POINTER(vnode->permit_cache, NULL); + spin_unlock(&vnode->lock); + + afs_put_permits(permits); + permits = NULL; + size = 0; + } } - /* have to be careful as the directory's callback may be broken between - * us receiving the status we're trying to cache and us getting the - * lock to update the cache for the status */ - if (auth_vnode->acl_order - acl_order > 0) { - _debug("ACL changed?"); - goto out_unlock; + if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) { + rcu_read_unlock(); + goto someone_else_changed_it; } - /* always update the anonymous mask */ - _debug("anon access %x", vnode->status.anon_access); - auth_vnode->status.anon_access = vnode->status.anon_access; - if (key == vnode->volume->cell->anonymous_key) - goto out_unlock; - - xpermits = auth_vnode->permits; - count = 0; - if (xpermits) { - /* see if the permit is already in the list - * - if it is then we just amend the list - */ - count = xpermits->count; - permit = xpermits->permits; - for (loop = count; loop > 0; loop--) { - if (permit->key == key) { - permit->access_mask = - vnode->status.caller_access; - goto out_unlock; + /* We need a ref on any permits list we want to copy as we'll have to + * drop the lock to do memory allocation. + */ + if (permits && !refcount_inc_not_zero(&permits->usage)) { + rcu_read_unlock(); + goto someone_else_changed_it; + } + + rcu_read_unlock(); + + /* Speculatively create a new list with the revised permission set. We + * discard this if we find an extant match already in the hash, but + * it's easier to compare with memcmp this way. + * + * We fill in the key pointers at this time, but we don't get the refs + * yet. 
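The copy-with-insert that follows can be modelled in userspace as below; insert_permit_sim mirrors the i/j walk in the code after this comment and assumes the key is not already present (the in-place update case is handled earlier in the real function):

#include <stdint.h>
#include <stdio.h>

struct permit_sim {
        void *key;              /* the sort key is the key pointer itself */
        unsigned int access;
};

static void insert_permit_sim(struct permit_sim *dst,
                              const struct permit_sim *src, int n,
                              void *key, unsigned int access)
{
        int i = 0, j = 0;

        for (; i < n; i++) {
                if (j == i && (uintptr_t)src[i].key > (uintptr_t)key) {
                        dst[j].key = key;       /* new key sorts before src[i] */
                        dst[j].access = access;
                        j++;
                }
                dst[j++] = src[i];
        }
        if (j == i) {                           /* new key sorts after the rest */
                dst[j].key = key;
                dst[j].access = access;
        }
}

int main(void)
{
        static int k[4];        /* &k[0] < &k[1] < &k[2] < &k[3] */
        struct permit_sim old[3], new[4];

        for (int i = 0; i < 3; i++)
                old[i] = (struct permit_sim){ &k[i], (unsigned int)i };

        insert_permit_sim(new, old, 3, &k[3], 7);       /* tail-insert case */
        for (int i = 0; i < 4; i++)
                printf("%p -> %u\n", new[i].key, new[i].access);
        return 0;
}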
+ */ + size++; + new = kzalloc(sizeof(struct afs_permits) + + sizeof(struct afs_permit) * size, GFP_NOFS); + if (!new) + goto out_put; + + refcount_set(&new->usage, 1); + new->nr_permits = size; + i = j = 0; + if (permits) { + for (i = 0; i < permits->nr_permits; i++) { + if (j == i && permits->permits[i].key > key) { + new->permits[j].key = key; + new->permits[j].access = caller_access; + j++; } - permit++; + new->permits[j].key = permits->permits[i].key; + new->permits[j].access = permits->permits[i].access; + j++; } } - permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1), - GFP_NOFS); - if (!permits) - goto out_unlock; - - if (xpermits) - memcpy(permits->permits, xpermits->permits, - count * sizeof(struct afs_permit)); - - _debug("key %x access %x", - key_serial(key), vnode->status.caller_access); - permits->permits[count].access_mask = vnode->status.caller_access; - permits->permits[count].key = key_get(key); - permits->count = count + 1; - - rcu_assign_pointer(auth_vnode->permits, permits); - if (xpermits) - call_rcu(&xpermits->rcu, afs_dispose_of_permits); - -out_unlock: - mutex_unlock(&auth_vnode->permits_lock); - iput(&auth_vnode->vfs_inode); - _leave(""); + if (j == i) { + new->permits[j].key = key; + new->permits[j].access = caller_access; + } + + afs_hash_permits(new); + + /* Now see if the permit list we want is actually already available */ + spin_lock(&afs_permits_lock); + + hash_for_each_possible(afs_permits_cache, xpermits, hash_node, new->h) { + if (xpermits->h != new->h || + xpermits->invalidated || + xpermits->nr_permits != new->nr_permits || + memcmp(xpermits->permits, new->permits, + new->nr_permits * sizeof(struct afs_permit)) != 0) + continue; + + if (refcount_inc_not_zero(&xpermits->usage)) { + replacement = xpermits; + goto found; + } + + break; + } + + for (i = 0; i < new->nr_permits; i++) + key_get(new->permits[i].key); + hash_add_rcu(afs_permits_cache, &new->hash_node, new->h); + replacement = new; + new = NULL; + +found: + spin_unlock(&afs_permits_lock); + + kfree(new); + + spin_lock(&vnode->lock); + zap = rcu_access_pointer(vnode->permit_cache); + if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) && + zap == permits) + rcu_assign_pointer(vnode->permit_cache, replacement); + else + zap = replacement; + spin_unlock(&vnode->lock); + afs_put_permits(zap); +out_put: + afs_put_permits(permits); + return; + +someone_else_changed_it_unlock: + spin_unlock(&vnode->lock); +someone_else_changed_it: + /* Someone else changed the cache under us - don't recheck at this + * time. 
+ */ + return; } /* @@ -214,60 +286,49 @@ out_unlock: * permitted to be accessed with this authorisation, and if so, what access it * is granted */ -static int afs_check_permit(struct afs_vnode *vnode, struct key *key, - afs_access_t *_access) +int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) { struct afs_permits *permits; - struct afs_permit *permit; - struct afs_vnode *auth_vnode; - bool valid; - int loop, ret; + bool valid = false; + int i, ret; _enter("{%x:%u},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); - auth_vnode = afs_get_auth_inode(vnode, key); - if (IS_ERR(auth_vnode)) { - *_access = 0; - _leave(" = %ld", PTR_ERR(auth_vnode)); - return PTR_ERR(auth_vnode); - } - - ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode)); + permits = vnode->permit_cache; /* check the permits to see if we've got one yet */ - if (key == auth_vnode->volume->cell->anonymous_key) { + if (key == vnode->volume->cell->anonymous_key) { _debug("anon"); - *_access = auth_vnode->status.anon_access; + *_access = vnode->status.anon_access; valid = true; } else { - valid = false; rcu_read_lock(); - permits = rcu_dereference(auth_vnode->permits); + permits = rcu_dereference(vnode->permit_cache); if (permits) { - permit = permits->permits; - for (loop = permits->count; loop > 0; loop--) { - if (permit->key == key) { - _debug("found in cache"); - *_access = permit->access_mask; - valid = true; + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) break; - } - permit++; + + *_access = permits->permits[i].access; + valid = !permits->invalidated; + break; } } rcu_read_unlock(); } if (!valid) { - /* check the status on the file we're actually interested in - * (the post-processing will cache the result on auth_vnode) */ + /* Check the status on the file we're actually interested in + * (the post-processing will cache the result). 
+ */ _debug("no valid permit"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_vnode_fetch_status(vnode, auth_vnode, key); + ret = afs_fetch_status(vnode, key); if (ret < 0) { - iput(&auth_vnode->vfs_inode); *_access = 0; _leave(" = %d", ret); return ret; @@ -275,7 +336,6 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, *_access = vnode->status.caller_access; } - iput(&auth_vnode->vfs_inode); _leave(" = 0 [access %x]", *_access); return 0; } @@ -304,14 +364,9 @@ int afs_permission(struct inode *inode, int mask) return PTR_ERR(key); } - /* if the promise has expired, we need to check the server again */ - if (!vnode->cb_promised) { - _debug("not promised"); - ret = afs_vnode_fetch_status(vnode, NULL, key); - if (ret < 0) - goto error; - _debug("new promise [fl=%lx]", vnode->flags); - } + ret = afs_validate(vnode, key); + if (ret < 0) + goto error; /* check the permits to see if we've got one yet */ ret = afs_check_permit(vnode, key, &access); @@ -365,3 +420,12 @@ error: _leave(" = %d", ret); return ret; } + +void __exit afs_clean_up_permit_cache(void) +{ + int i; + + for (i = 0; i < HASH_SIZE(afs_permits_cache); i++) + WARN_ON_ONCE(!hlist_empty(&afs_permits_cache[i])); + +} diff --git a/fs/afs/server.c b/fs/afs/server.c index c001b1f2455f..1880f1b6a9f1 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -11,317 +11,689 @@ #include <linux/sched.h> #include <linux/slab.h> +#include "afs_fs.h" #include "internal.h" -static unsigned afs_server_timeout = 10; /* server timeout in seconds */ +static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ +static unsigned afs_server_update_delay = 30; /* Time till VLDB recheck in secs */ -static void afs_reap_server(struct work_struct *); +static void afs_inc_servers_outstanding(struct afs_net *net) +{ + atomic_inc(&net->servers_outstanding); +} + +static void afs_dec_servers_outstanding(struct afs_net *net) +{ + if (atomic_dec_and_test(&net->servers_outstanding)) + wake_up_atomic_t(&net->servers_outstanding); +} + +/* + * Find a server by one of its addresses. 
+ */ +struct afs_server *afs_find_server(struct afs_net *net, + const struct sockaddr_rxrpc *srx) +{ + const struct sockaddr_in6 *a = &srx->transport.sin6, *b; + const struct afs_addr_list *alist; + struct afs_server *server = NULL; + unsigned int i; + bool ipv6 = true; + int seq = 0, diff; + + if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 || + srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 || + srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff)) + ipv6 = false; + + rcu_read_lock(); + + do { + if (server) + afs_put_server(net, server); + server = NULL; + read_seqbegin_or_lock(&net->fs_addr_lock, &seq); + + if (ipv6) { + hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + alist = rcu_dereference(server->addresses); + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { + b = &alist->addrs[i].transport.sin6; + diff = (u16)a->sin6_port - (u16)b->sin6_port; + if (diff == 0) + diff = memcmp(&a->sin6_addr, + &b->sin6_addr, + sizeof(struct in6_addr)); + if (diff == 0) + goto found; + if (diff < 0) { + // TODO: Sort the list + //if (i == alist->nr_ipv4) + // goto not_found; + break; + } + } + } + } else { + hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { + alist = rcu_dereference(server->addresses); + for (i = 0; i < alist->nr_ipv4; i++) { + b = &alist->addrs[i].transport.sin6; + diff = (u16)a->sin6_port - (u16)b->sin6_port; + if (diff == 0) + diff = ((u32)a->sin6_addr.s6_addr32[3] - + (u32)b->sin6_addr.s6_addr32[3]); + if (diff == 0) + goto found; + if (diff < 0) { + // TODO: Sort the list + //if (i == 0) + // goto not_found; + break; + } + } + } + } + + //not_found: + server = NULL; + found: + if (server && !atomic_inc_not_zero(&server->usage)) + server = NULL; + + } while (need_seqretry(&net->fs_addr_lock, seq)); -/* tree of all the servers, indexed by IP address */ -static struct rb_root afs_servers = RB_ROOT; -static DEFINE_RWLOCK(afs_servers_lock); + done_seqretry(&net->fs_addr_lock, seq); -/* LRU list of all the servers not currently in use */ -static LIST_HEAD(afs_server_graveyard); -static DEFINE_SPINLOCK(afs_server_graveyard_lock); -static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server); + rcu_read_unlock(); + return server; +} /* - * install a server record in the master tree + * Look up a server by its UUID */ -static int afs_install_server(struct afs_server *server) +struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid) { - struct afs_server *xserver; + struct afs_server *server = NULL; + struct rb_node *p; + int diff, seq = 0; + + _enter("%pU", uuid); + + do { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. 
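The retry idiom referred to in this comment can be sketched with a bare sequence counter. Note that the real read_seqbegin_or_lock()/need_seqretry() pair escalates to taking the lock exclusively on the retry pass, which guarantees the second walk is stable; this toy merely loops:

#include <stdio.h>

static unsigned int seq;        /* even: stable; odd: writer in progress */
static int tree_generation;     /* stands in for the rbtree contents */

/* Simulate a writer racing with the first read pass only. */
static void maybe_race(int pass)
{
        if (pass == 0) {
                seq += 2;       /* a writer came and went: bump the sequence */
                tree_generation++;
        }
}

int main(void)
{
        unsigned int s;
        int snapshot, pass = 0;

        do {
                s = seq;                        /* read_seqbegin_or_lock() */
                snapshot = tree_generation;     /* the "rbtree walk" */
                maybe_race(pass++);
        } while (seq != s);                     /* need_seqretry(): go again */

        printf("stable snapshot %d after %d pass(es)\n", snapshot, pass);
        return 0;
}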
+ */ + if (server) + afs_put_server(net, server); + server = NULL; + + read_seqbegin_or_lock(&net->fs_lock, &seq); + + p = net->fs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, uuid_rb); + + diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); + if (diff < 0) { + p = p->rb_left; + } else if (diff > 0) { + p = p->rb_right; + } else { + afs_get_server(server); + break; + } + + server = NULL; + } + } while (need_seqretry(&net->fs_lock, seq)); + + done_seqretry(&net->fs_lock, seq); + + _leave(" = %p", server); + return server; +} + +/* + * Install a server record in the namespace tree + */ +static struct afs_server *afs_install_server(struct afs_net *net, + struct afs_server *candidate) +{ + const struct afs_addr_list *alist; + struct afs_server *server; struct rb_node **pp, *p; - int ret; + int ret = -EEXIST, diff; - _enter("%p", server); + _enter("%p", candidate); - write_lock(&afs_servers_lock); + write_seqlock(&net->fs_lock); - ret = -EEXIST; - pp = &afs_servers.rb_node; + /* Firstly install the server in the UUID lookup tree */ + pp = &net->fs_servers.rb_node; p = NULL; while (*pp) { p = *pp; _debug("- consider %p", p); - xserver = rb_entry(p, struct afs_server, master_rb); - if (server->addr.s_addr < xserver->addr.s_addr) + server = rb_entry(p, struct afs_server, uuid_rb); + diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t)); + if (diff < 0) pp = &(*pp)->rb_left; - else if (server->addr.s_addr > xserver->addr.s_addr) + else if (diff > 0) pp = &(*pp)->rb_right; else - goto error; + goto exists; } - rb_link_node(&server->master_rb, p, pp); - rb_insert_color(&server->master_rb, &afs_servers); + server = candidate; + rb_link_node(&server->uuid_rb, p, pp); + rb_insert_color(&server->uuid_rb, &net->fs_servers); + hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + + write_seqlock(&net->fs_addr_lock); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&net->fs_addr_lock.lock)); + + /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install + * it in the IPv4 and/or IPv6 reverse-map lists. + * + * TODO: For speed we want to use something other than a flat list + * here; even sorting the list in terms of lowest address would help a + * bit, but anything we might want to do gets messy and memory + * intensive. 
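The UUID insertion earlier in afs_install_server() uses the classic parent/link (pp) descent. Here is the same walk rendered over a plain binary search tree, without the rb_node recolouring machinery (node_sim and install_sim are invented for illustration):

#include <stdio.h>
#include <string.h>

struct node_sim {
        char uuid[16];
        struct node_sim *left, *right;
};

static struct node_sim *install_sim(struct node_sim **root,
                                    struct node_sim *cand)
{
        struct node_sim **pp = root;

        while (*pp) {
                int diff = memcmp(cand->uuid, (*pp)->uuid, sizeof(cand->uuid));

                if (diff < 0)
                        pp = &(*pp)->left;
                else if (diff > 0)
                        pp = &(*pp)->right;
                else
                        return *pp;     /* exists: caller discards cand */
        }
        *pp = cand;                     /* rb_link_node() equivalent */
        return cand;
}

int main(void)
{
        struct node_sim *root = NULL;
        struct node_sim a = { .uuid = "0123456789abcde" };
        struct node_sim dup = { .uuid = "0123456789abcde" };

        printf("%s\n", install_sim(&root, &a) == &a ? "installed" : "exists");
        printf("%s\n", install_sim(&root, &dup) == &dup ? "installed" : "exists");
        return 0;
}

As in the patch, the caller is expected to discard its candidate and take a reference on the extant record when the walk finds a duplicate.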
+ */ + if (alist->nr_ipv4 > 0) + hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4); + if (alist->nr_addrs > alist->nr_ipv4) + hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6); + + write_sequnlock(&net->fs_addr_lock); ret = 0; -error: - write_unlock(&afs_servers_lock); - return ret; +exists: + afs_get_server(server); + write_sequnlock(&net->fs_lock); + return server; } /* * allocate a new server record */ -static struct afs_server *afs_alloc_server(struct afs_cell *cell, - const struct in_addr *addr) +static struct afs_server *afs_alloc_server(struct afs_net *net, + const uuid_t *uuid, + struct afs_addr_list *alist) { struct afs_server *server; _enter(""); server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); - if (server) { - atomic_set(&server->usage, 1); - server->cell = cell; - - INIT_LIST_HEAD(&server->link); - INIT_LIST_HEAD(&server->grave); - init_rwsem(&server->sem); - spin_lock_init(&server->fs_lock); - server->fs_vnodes = RB_ROOT; - server->cb_promises = RB_ROOT; - spin_lock_init(&server->cb_lock); - init_waitqueue_head(&server->cb_break_waitq); - INIT_DELAYED_WORK(&server->cb_break_work, - afs_dispatch_give_up_callbacks); - - memcpy(&server->addr, addr, sizeof(struct in_addr)); - server->addr.s_addr = addr->s_addr; - _leave(" = %p{%d}", server, atomic_read(&server->usage)); - } else { - _leave(" = NULL [nomem]"); - } + if (!server) + goto enomem; + + atomic_set(&server->usage, 1); + RCU_INIT_POINTER(server->addresses, alist); + server->addr_version = alist->version; + server->uuid = *uuid; + server->flags = (1UL << AFS_SERVER_FL_NEW); + server->update_at = ktime_get_real_seconds() + afs_server_update_delay; + rwlock_init(&server->fs_lock); + INIT_LIST_HEAD(&server->cb_interests); + rwlock_init(&server->cb_break_lock); + + afs_inc_servers_outstanding(net); + _leave(" = %p", server); return server; + +enomem: + _leave(" = NULL [nomem]"); + return NULL; +} + +/* + * Look up an address record for a server + */ +static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, + struct key *key, const uuid_t *uuid) +{ + struct afs_addr_cursor ac; + struct afs_addr_list *alist; + int ret; + + ret = afs_set_vl_cursor(&ac, cell); + if (ret < 0) + return ERR_PTR(ret); + + while (afs_iterate_addresses(&ac)) { + if (test_bit(ac.index, &ac.alist->yfs)) + alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid); + else + alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid); + switch (ac.error) { + case 0: + afs_end_cursor(&ac); + return alist; + case -ECONNABORTED: + ac.error = afs_abort_to_error(ac.abort_code); + goto error; + case -ENOMEM: + case -ENONET: + goto error; + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + break; + default: + ac.error = -EIO; + goto error; + } + } + +error: + return ERR_PTR(afs_end_cursor(&ac)); } /* - * get an FS-server record for a cell + * Get or create a fileserver record. 
*/ -struct afs_server *afs_lookup_server(struct afs_cell *cell, - const struct in_addr *addr) +struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, + const uuid_t *uuid) { + struct afs_addr_list *alist; struct afs_server *server, *candidate; - _enter("%p,%pI4", cell, &addr->s_addr); + _enter("%p,%pU", cell->net, uuid); - /* quick scan of the list to see if we already have the server */ - read_lock(&cell->servers_lock); + server = afs_find_server_by_uuid(cell->net, uuid); + if (server) + return server; - list_for_each_entry(server, &cell->servers, link) { - if (server->addr.s_addr == addr->s_addr) - goto found_server_quickly; - } - read_unlock(&cell->servers_lock); + alist = afs_vl_lookup_addrs(cell, key, uuid); + if (IS_ERR(alist)) + return ERR_CAST(alist); - candidate = afs_alloc_server(cell, addr); + candidate = afs_alloc_server(cell->net, uuid, alist); if (!candidate) { - _leave(" = -ENOMEM"); + afs_put_addrlist(alist); return ERR_PTR(-ENOMEM); } - write_lock(&cell->servers_lock); - - /* check the cell's server list again */ - list_for_each_entry(server, &cell->servers, link) { - if (server->addr.s_addr == addr->s_addr) - goto found_server; + server = afs_install_server(cell->net, candidate); + if (server != candidate) { + afs_put_addrlist(alist); + kfree(candidate); } - _debug("new"); - server = candidate; - if (afs_install_server(server) < 0) - goto server_in_two_cells; - - afs_get_cell(cell); - list_add_tail(&server->link, &cell->servers); - - write_unlock(&cell->servers_lock); _leave(" = %p{%d}", server, atomic_read(&server->usage)); return server; +} - /* found a matching server quickly */ -found_server_quickly: - _debug("found quickly"); - afs_get_server(server); - read_unlock(&cell->servers_lock); -no_longer_unused: - if (!list_empty(&server->grave)) { - spin_lock(&afs_server_graveyard_lock); - list_del_init(&server->grave); - spin_unlock(&afs_server_graveyard_lock); +/* + * Set the server timer to fire after a given delay, assuming it's not already + * set for an earlier time. + */ +static void afs_set_server_timer(struct afs_net *net, time64_t delay) +{ + if (net->live) { + afs_inc_servers_outstanding(net); + if (timer_reduce(&net->fs_timer, jiffies + delay * HZ)) + afs_dec_servers_outstanding(net); } - _leave(" = %p{%d}", server, atomic_read(&server->usage)); - return server; +} - /* found a matching server on the second pass */ -found_server: - _debug("found"); - afs_get_server(server); - write_unlock(&cell->servers_lock); - kfree(candidate); - goto no_longer_unused; - - /* found a server that seems to be in two cells */ -server_in_two_cells: - write_unlock(&cell->servers_lock); - kfree(candidate); - printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n", - addr); - _leave(" = -EEXIST"); - return ERR_PTR(-EEXIST); +/* + * Server management timer. We have an increment on fs_outstanding that we + * need to pass along to the work item. + */ +void afs_servers_timer(struct timer_list *timer) +{ + struct afs_net *net = container_of(timer, struct afs_net, fs_timer); + + _enter(""); + if (!queue_work(afs_wq, &net->fs_manager)) + afs_dec_servers_outstanding(net); } /* - * look up a server by its IP address + * Release a reference on a server record. 
*/ -struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx) +void afs_put_server(struct afs_net *net, struct afs_server *server) { - struct afs_server *server = NULL; - struct rb_node *p; - struct in_addr addr = srx->transport.sin.sin_addr; + unsigned int usage; - _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr); + if (!server) + return; - if (srx->transport.family != AF_INET) { - WARN(true, "AFS does not yes support non-IPv4 addresses\n"); - return NULL; - } + server->put_time = ktime_get_real_seconds(); - read_lock(&afs_servers_lock); + usage = atomic_dec_return(&server->usage); - p = afs_servers.rb_node; - while (p) { - server = rb_entry(p, struct afs_server, master_rb); + _enter("{%u}", usage); - _debug("- consider %p", p); + if (likely(usage > 0)) + return; - if (addr.s_addr < server->addr.s_addr) { - p = p->rb_left; - } else if (addr.s_addr > server->addr.s_addr) { - p = p->rb_right; - } else { - afs_get_server(server); - goto found; - } - } + afs_set_server_timer(net, afs_server_gc_delay); +} - server = NULL; -found: - read_unlock(&afs_servers_lock); - ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr); - _leave(" = %p", server); - return server; +static void afs_server_rcu(struct rcu_head *rcu) +{ + struct afs_server *server = container_of(rcu, struct afs_server, rcu); + + afs_put_addrlist(server->addresses); + kfree(server); } /* - * destroy a server record - * - removes from the cell list + * destroy a dead server */ -void afs_put_server(struct afs_server *server) +static void afs_destroy_server(struct afs_net *net, struct afs_server *server) { - if (!server) - return; + struct afs_addr_list *alist = server->addresses; + struct afs_addr_cursor ac = { + .alist = alist, + .addr = &alist->addrs[0], + .start = alist->index, + .index = alist->index, + .error = 0, + }; + _enter("%p", server); - _enter("%p{%d}", server, atomic_read(&server->usage)); + afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + call_rcu(&server->rcu, afs_server_rcu); + afs_dec_servers_outstanding(net); +} - _debug("PUT SERVER %d", atomic_read(&server->usage)); +/* + * Garbage collect any expired servers. + */ +static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) +{ + struct afs_server *server; + bool deleted; + int usage; + + while ((server = gc_list)) { + gc_list = server->gc_next; + + write_seqlock(&net->fs_lock); + usage = 1; + deleted = atomic_try_cmpxchg(&server->usage, &usage, 0); + if (deleted) { + rb_erase(&server->uuid_rb, &net->fs_servers); + hlist_del_rcu(&server->proc_link); + } + write_sequnlock(&net->fs_lock); - ASSERTCMP(atomic_read(&server->usage), >, 0); + if (deleted) + afs_destroy_server(net, server); + } +} - if (likely(!atomic_dec_and_test(&server->usage))) { - _leave(""); - return; +/* + * Manage the records of servers known to be within a network namespace. This + * includes garbage collecting unused servers. + * + * Note also that we were given an increment on net->servers_outstanding by + * whoever queued us that we need to deal with before returning. + */ +void afs_manage_servers(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, fs_manager); + struct afs_server *gc_list = NULL; + struct rb_node *cursor; + time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; + bool purging = !net->live; + + _enter(""); + + /* Trawl the server list looking for servers that have expired from + * lack of use. 
+ */ + read_seqlock_excl(&net->fs_lock); + + for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) { + struct afs_server *server = + rb_entry(cursor, struct afs_server, uuid_rb); + int usage = atomic_read(&server->usage); + + _debug("manage %pU %u", &server->uuid, usage); + + ASSERTCMP(usage, >=, 1); + ASSERTIFCMP(purging, usage, ==, 1); + + if (usage == 1) { + time64_t expire_at = server->put_time; + + if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && + !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) + expire_at += afs_server_gc_delay; + if (purging || expire_at <= now) { + server->gc_next = gc_list; + gc_list = server; + } else if (expire_at < next_manage) { + next_manage = expire_at; + } + } } - afs_flush_callback_breaks(server); + read_sequnlock_excl(&net->fs_lock); + + /* Update the timer on the way out. We have to pass an increment on + * servers_outstanding in the namespace that we are in to the timer or + * the work scheduler. + */ + if (!purging && next_manage < TIME64_MAX) { + now = ktime_get_real_seconds(); - spin_lock(&afs_server_graveyard_lock); - if (atomic_read(&server->usage) == 0) { - list_move_tail(&server->grave, &afs_server_graveyard); - server->time_of_death = ktime_get_real_seconds(); - queue_delayed_work(afs_wq, &afs_server_reaper, - afs_server_timeout * HZ); + if (next_manage - now <= 0) { + if (queue_work(afs_wq, &net->fs_manager)) + afs_inc_servers_outstanding(net); + } else { + afs_set_server_timer(net, next_manage - now); + } } - spin_unlock(&afs_server_graveyard_lock); - _leave(" [dead]"); + + afs_gc_servers(net, gc_list); + + afs_dec_servers_outstanding(net); + _leave(" [%d]", atomic_read(&net->servers_outstanding)); +} + +static void afs_queue_server_manager(struct afs_net *net) +{ + afs_inc_servers_outstanding(net); + if (!queue_work(afs_wq, &net->fs_manager)) + afs_dec_servers_outstanding(net); } /* - * destroy a dead server + * Purge list of servers. */ -static void afs_destroy_server(struct afs_server *server) +void afs_purge_servers(struct afs_net *net) { - _enter("%p", server); + _enter(""); - ASSERTIF(server->cb_break_head != server->cb_break_tail, - delayed_work_pending(&server->cb_break_work)); + if (del_timer_sync(&net->fs_timer)) + atomic_dec(&net->servers_outstanding); - ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL); - ASSERTCMP(server->cb_promises.rb_node, ==, NULL); - ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail); - ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0); + afs_queue_server_manager(net); - afs_put_cell(server->cell); - kfree(server); + _debug("wait"); + wait_on_atomic_t(&net->servers_outstanding, atomic_t_wait, + TASK_UNINTERRUPTIBLE); + _leave(""); } /* - * reap dead server records + * Probe a fileserver to find its capabilities. + * + * TODO: Try service upgrade. 
*/ -static void afs_reap_server(struct work_struct *work) +static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc) { - LIST_HEAD(corpses); - struct afs_server *server; - unsigned long delay, expiry; - time64_t now; - - now = ktime_get_real_seconds(); - spin_lock(&afs_server_graveyard_lock); - - while (!list_empty(&afs_server_graveyard)) { - server = list_entry(afs_server_graveyard.next, - struct afs_server, grave); + _enter(""); - /* the queue is ordered most dead first */ - expiry = server->time_of_death + afs_server_timeout; - if (expiry > now) { - delay = (expiry - now) * HZ; - mod_delayed_work(afs_wq, &afs_server_reaper, delay); + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(fc->ac.alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + + while (afs_iterate_addresses(&fc->ac)) { + afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server, + &fc->ac, fc->key); + switch (fc->ac.error) { + case 0: + afs_end_cursor(&fc->ac); + set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags); + return true; + case -ECONNABORTED: + fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + goto error; + case -ENOMEM: + case -ENONET: + goto error; + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: break; + default: + fc->ac.error = -EIO; + goto error; } + } - write_lock(&server->cell->servers_lock); - write_lock(&afs_servers_lock); - if (atomic_read(&server->usage) > 0) { - list_del_init(&server->grave); - } else { - list_move_tail(&server->grave, &corpses); - list_del_init(&server->link); - rb_erase(&server->master_rb, &afs_servers); - } - write_unlock(&afs_servers_lock); - write_unlock(&server->cell->servers_lock); +error: + afs_end_cursor(&fc->ac); + return false; +} + +/* + * If we haven't already, try probing the fileserver to get its capabilities. + * We try not to instigate parallel probes, but it's possible that the parallel + * probes will fail due to authentication failure when ours would succeed. + * + * TODO: Try sending an anonymous probe if an authenticated probe fails. + */ +bool afs_probe_fileserver(struct afs_fs_cursor *fc) +{ + bool success; + int ret, retries = 0; + + _enter(""); + +retry: + if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) { + _leave(" = t"); + return true; } - spin_unlock(&afs_server_graveyard_lock); + if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) { + success = afs_do_probe_fileserver(fc); + clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags); + wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING); + _leave(" = t"); + return success; + } + + _debug("wait"); + ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING, + TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + fc->ac.error = ret; + _leave(" = f [%d]", ret); + return false; + } - /* now reap the corpses we've extracted */ - while (!list_empty(&corpses)) { - server = list_entry(corpses.next, struct afs_server, grave); - list_del(&server->grave); - afs_destroy_server(server); + retries++; + if (retries == 4) { + fc->ac.error = -ESTALE; + _leave(" = f [stale]"); + return false; } + _debug("retry"); + goto retry; } /* - * discard all the server records for rmmod + * Get an update for a server's address list. 
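Both afs_probe_fileserver() above and afs_check_server_record() below serialise their slow path behind a single busy bit: the first task to win the bit does the work, everyone else waits for it to clear and then rechecks. A threadless caricature of that single-flight pattern (an atomic_flag standing in for test_and_set_bit_lock()/clear_bit_unlock()/wait_on_bit()):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag updating = ATOMIC_FLAG_INIT;

static void check_record_sim(const char *who)
{
        if (!atomic_flag_test_and_set(&updating)) {
                printf("%s: won the flag, updating the record\n", who);
                /* ... fetch the new address list here ... */
                atomic_flag_clear(&updating);   /* plus wake_up_bit() */
        } else {
                printf("%s: update already in flight, would wait\n", who);
        }
}

int main(void)
{
        check_record_sim("A");
        atomic_flag_test_and_set(&updating);    /* pretend an update is running */
        check_record_sim("B");
        atomic_flag_clear(&updating);
        return 0;
}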
*/ -void __exit afs_purge_servers(void) +static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server) { - afs_server_timeout = 0; - mod_delayed_work(afs_wq, &afs_server_reaper, 0); + struct afs_addr_list *alist, *discard; + + _enter(""); + + alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key, + &server->uuid); + if (IS_ERR(alist)) { + fc->ac.error = PTR_ERR(alist); + _leave(" = f [%d]", fc->ac.error); + return false; + } + + discard = alist; + if (server->addr_version != alist->version) { + write_lock(&server->fs_lock); + discard = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->fs_lock)); + rcu_assign_pointer(server->addresses, alist); + server->addr_version = alist->version; + write_unlock(&server->fs_lock); + } + + server->update_at = ktime_get_real_seconds() + afs_server_update_delay; + afs_put_addrlist(discard); + _leave(" = t"); + return true; +} + +/* + * See if a server's address list needs updating. + */ +bool afs_check_server_record(struct afs_fs_cursor *fc, struct afs_server *server) +{ + time64_t now = ktime_get_real_seconds(); + long diff; + bool success; + int ret, retries = 0; + + _enter(""); + + ASSERT(server); + +retry: + diff = READ_ONCE(server->update_at) - now; + if (diff > 0) { + _leave(" = t [not now %ld]", diff); + return true; + } + + if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { + success = afs_update_server_record(fc, server); + clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); + wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); + _leave(" = %d", success); + return success; + } + + ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING, + TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + fc->ac.error = ret; + _leave(" = f [intr]"); + return false; + } + + retries++; + if (retries == 4) { + _leave(" = f [stale]"); + ret = -ESTALE; + return false; + } + goto retry; } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c new file mode 100644 index 000000000000..0ab3f8457839 --- /dev/null +++ b/fs/afs/server_list.c @@ -0,0 +1,153 @@ +/* AFS fileserver list management. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include "internal.h" + +void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) +{ + int i; + + if (slist && refcount_dec_and_test(&slist->usage)) { + for (i = 0; i < slist->nr_servers; i++) { + afs_put_cb_interest(net, slist->servers[i].cb_interest); + afs_put_server(net, slist->servers[i].server); + } + kfree(slist); + } +} + +/* + * Build a server list from a VLDB record. 
+ */ +struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, + struct key *key, + struct afs_vldb_entry *vldb, + u8 type_mask) +{ + struct afs_server_list *slist; + struct afs_server *server; + int ret = -ENOMEM, nr_servers = 0, i, j; + + for (i = 0; i < vldb->nr_servers; i++) + if (vldb->fs_mask[i] & type_mask) + nr_servers++; + + slist = kzalloc(sizeof(struct afs_server_list) + + sizeof(struct afs_server_entry) * nr_servers, + GFP_KERNEL); + if (!slist) + goto error; + + refcount_set(&slist->usage, 1); + + /* Make sure a record exists for each server in the list. */ + for (i = 0; i < vldb->nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + + server = afs_lookup_server(cell, key, &vldb->fs_server[i]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + if (ret == -ENOENT) + continue; + goto error_2; + } + + /* Insertion-sort by server pointer */ + for (j = 0; j < slist->nr_servers; j++) + if (slist->servers[j].server >= server) + break; + if (j < slist->nr_servers) { + if (slist->servers[j].server == server) { + afs_put_server(cell->net, server); + continue; + } + + memmove(slist->servers + j + 1, + slist->servers + j, + (slist->nr_servers - j) * sizeof(struct afs_server_entry)); + } + + slist->servers[j].server = server; + slist->nr_servers++; + } + + if (slist->nr_servers == 0) { + ret = -EDESTADDRREQ; + goto error_2; + } + + return slist; + +error_2: + afs_put_serverlist(cell->net, slist); +error: + return ERR_PTR(ret); +} + +/* + * Copy the annotations from an old server list to its potential replacement. + */ +bool afs_annotate_server_list(struct afs_server_list *new, + struct afs_server_list *old) +{ + struct afs_server *cur; + int i, j; + + if (old->nr_servers != new->nr_servers) + goto changed; + + for (i = 0; i < old->nr_servers; i++) + if (old->servers[i].server != new->servers[i].server) + goto changed; + + return false; + +changed: + /* Maintain the same current server as before if possible. */ + cur = old->servers[old->index].server; + for (j = 0; j < new->nr_servers; j++) { + if (new->servers[j].server == cur) { + new->index = j; + break; + } + } + + /* Keep the old callback interest records where possible so that we + * maintain callback interception. 
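The annotation copy that follows is a two-cursor merge over two lists sorted by server pointer. Modelled standalone (entry_sim and cb_note are stand-ins for struct afs_server_entry and its cb_interest pointer; refcounting is omitted):

#include <stdint.h>
#include <stdio.h>

struct entry_sim {
        const void *server;     /* both lists are sorted by this pointer */
        int cb_note;            /* stands in for the cb_interest */
};

static void annotate_sim(struct entry_sim *new, int n_new,
                         const struct entry_sim *old, int n_old)
{
        int i = 0, j = 0;

        while (i < n_old && j < n_new) {
                if (new[j].server == old[i].server) {
                        new[j].cb_note = old[i].cb_note;  /* carry it over */
                        i++;
                        j++;
                } else if ((uintptr_t)new[j].server <
                           (uintptr_t)old[i].server) {
                        j++;    /* server only in the new list */
                } else {
                        i++;    /* server dropped from the list */
                }
        }
}

int main(void)
{
        static int s[3];        /* &s[0] < &s[1] < &s[2] */
        struct entry_sim old[] = { { &s[0], 10 }, { &s[1], 11 } };
        struct entry_sim new[] = { { &s[0], 0 }, { &s[2], 0 } };

        annotate_sim(new, 2, old, 2);
        printf("notes: %d %d\n", new[0].cb_note, new[1].cb_note);  /* 10 0 */
        return 0;
}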
+ */ + i = 0; + j = 0; + while (i < old->nr_servers && j < new->nr_servers) { + if (new->servers[j].server == old->servers[i].server) { + struct afs_cb_interest *cbi = old->servers[i].cb_interest; + if (cbi) { + new->servers[j].cb_interest = cbi; + refcount_inc(&cbi->usage); + } + i++; + j++; + continue; + } + + if (new->servers[j].server < old->servers[i].server) { + j++; + continue; + } + + i++; + continue; + } + + return true; +} diff --git a/fs/afs/super.c b/fs/afs/super.c index 689173c0a682..1037dd41a622 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -25,11 +25,10 @@ #include <linux/statfs.h> #include <linux/sched.h> #include <linux/nsproxy.h> +#include <linux/magic.h> #include <net/net_namespace.h> #include "internal.h" -#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ - static void afs_i_init_once(void *foo); static struct dentry *afs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); @@ -143,9 +142,9 @@ void __exit afs_fs_exit(void) */ static int afs_show_devname(struct seq_file *m, struct dentry *root) { - struct afs_super_info *as = root->d_sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(root->d_sb); struct afs_volume *volume = as->volume; - struct afs_cell *cell = volume->cell; + struct afs_cell *cell = as->cell; const char *suf = ""; char pref = '%'; @@ -163,7 +162,7 @@ static int afs_show_devname(struct seq_file *m, struct dentry *root) break; } - seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->vlocation->vldb.name, suf); + seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->name, suf); return 0; } @@ -201,12 +200,14 @@ static int afs_parse_options(struct afs_mount_params *params, token = match_token(p, afs_options_list, args); switch (token) { case afs_opt_cell: - cell = afs_cell_lookup(args[0].from, - args[0].to - args[0].from, - false); + rcu_read_lock(); + cell = afs_lookup_cell_rcu(params->net, + args[0].from, + args[0].to - args[0].from); + rcu_read_unlock(); if (IS_ERR(cell)) return PTR_ERR(cell); - afs_put_cell(params->cell); + afs_put_cell(params->net, params->cell); params->cell = cell; break; @@ -308,13 +309,14 @@ static int afs_parse_device_name(struct afs_mount_params *params, /* lookup the cell record */ if (cellname || !params->cell) { - cell = afs_cell_lookup(cellname, cellnamesz, true); + cell = afs_lookup_cell(params->net, cellname, cellnamesz, + NULL, false); if (IS_ERR(cell)) { printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(params->cell); + afs_put_cell(params->net, params->cell); params->cell = cell; } @@ -332,14 +334,16 @@ static int afs_parse_device_name(struct afs_mount_params *params, static int afs_test_super(struct super_block *sb, void *data) { struct afs_super_info *as1 = data; - struct afs_super_info *as = sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(sb); - return as->volume == as1->volume; + return as->net == as1->net && as->volume->vid == as1->volume->vid; } static int afs_set_super(struct super_block *sb, void *data) { - sb->s_fs_info = data; + struct afs_super_info *as = data; + + sb->s_fs_info = as; return set_anon_super(sb, NULL); } @@ -349,7 +353,7 @@ static int afs_set_super(struct super_block *sb, void *data) static int afs_fill_super(struct super_block *sb, struct afs_mount_params *params) { - struct afs_super_info *as = sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(sb); struct afs_fid fid; struct inode *inode = NULL; int ret; @@ -366,13 +370,15 @@ static int afs_fill_super(struct 
super_block *sb, if (ret) return ret; sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); + sprintf(sb->s_id, "%u", as->volume->vid); + + afs_activate_volume(as->volume); /* allocate the root inode and dentry */ fid.vid = as->volume->vid; fid.vnode = 1; fid.unique = 1; - inode = afs_iget(sb, params->key, &fid, NULL, NULL); + inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -394,23 +400,45 @@ error: return ret; } +static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params) +{ + struct afs_super_info *as; + + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (as) { + as->net = afs_get_net(params->net); + as->cell = afs_get_cell(params->cell); + } + return as; +} + +static void afs_destroy_sbi(struct afs_super_info *as) +{ + if (as) { + afs_put_volume(as->cell, as->volume); + afs_put_cell(as->net, as->cell); + afs_put_net(as->net); + kfree(as); + } +} + /* * get an AFS superblock */ static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *options) + int flags, const char *dev_name, void *options) { struct afs_mount_params params; struct super_block *sb; - struct afs_volume *vol; + struct afs_volume *candidate; struct key *key; - char *new_opts = kstrdup(options, GFP_KERNEL); struct afs_super_info *as; int ret; _enter(",,%s,%p", dev_name, options); memset(¶ms, 0, sizeof(params)); + params.net = &__afs_net; ret = -EINVAL; if (current->nsproxy->net_ns != &init_net) @@ -436,70 +464,81 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, } params.key = key; - /* parse the device name */ - vol = afs_volume_lookup(¶ms); - if (IS_ERR(vol)) { - ret = PTR_ERR(vol); - goto error; - } - /* allocate a superblock info record */ - as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); - if (!as) { - ret = -ENOMEM; - afs_put_volume(vol); - goto error; + ret = -ENOMEM; + as = afs_alloc_sbi(¶ms); + if (!as) + goto error_key; + + /* Assume we're going to need a volume record; at the very least we can + * use it to update the volume record if we have one already. This + * checks that the volume exists within the cell. 
+ */ + candidate = afs_create_volume(¶ms); + if (IS_ERR(candidate)) { + ret = PTR_ERR(candidate); + goto error_as; } - as->volume = vol; + + as->volume = candidate; /* allocate a deviceless superblock */ sb = sget(fs_type, afs_test_super, afs_set_super, flags, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - afs_put_volume(vol); - kfree(as); - goto error; + goto error_as; } if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); ret = afs_fill_super(sb, ¶ms); - if (ret < 0) { - deactivate_locked_super(sb); - goto error; - } - sb->s_flags |= MS_ACTIVE; + if (ret < 0) + goto error_sb; + as = NULL; + sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); - ASSERTCMP(sb->s_flags, &, MS_ACTIVE); - afs_put_volume(vol); - kfree(as); + ASSERTCMP(sb->s_flags, &, SB_ACTIVE); + afs_destroy_sbi(as); + as = NULL; } - afs_put_cell(params.cell); - kfree(new_opts); + afs_put_cell(params.net, params.cell); + key_put(params.key); _leave(" = 0 [%p]", sb); return dget(sb->s_root); -error: - afs_put_cell(params.cell); +error_sb: + deactivate_locked_super(sb); + goto error_key; +error_as: + afs_destroy_sbi(as); +error_key: key_put(params.key); - kfree(new_opts); +error: + afs_put_cell(params.net, params.cell); _leave(" = %d", ret); return ERR_PTR(ret); } static void afs_kill_super(struct super_block *sb) { - struct afs_super_info *as = sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(sb); + + /* Clear the callback interests (which will do ilookup5) before + * deactivating the superblock. + */ + afs_clear_callback_interests(as->net, as->volume->servers); kill_anon_super(sb); - afs_put_volume(as->volume); - kfree(as); + afs_deactivate_volume(as->volume); + afs_destroy_sbi(as); } /* - * initialise an inode cache slab element prior to any use + * Initialise an inode cache slab element prior to any use. Note that + * afs_alloc_inode() *must* reset anything that could incorrectly leak from one + * inode to another. */ static void afs_i_init_once(void *_vnode) { @@ -507,16 +546,15 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->vfs_inode); - init_waitqueue_head(&vnode->update_waitq); - mutex_init(&vnode->permits_lock); + mutex_init(&vnode->io_lock); mutex_init(&vnode->validate_lock); - spin_lock_init(&vnode->writeback_lock); + spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); - INIT_LIST_HEAD(&vnode->writebacks); + INIT_LIST_HEAD(&vnode->wb_keys); INIT_LIST_HEAD(&vnode->pending_locks); INIT_LIST_HEAD(&vnode->granted_locks); INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); - INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work); + seqlock_init(&vnode->cb_lock); } /* @@ -532,13 +570,21 @@ static struct inode *afs_alloc_inode(struct super_block *sb) atomic_inc(&afs_count_active_inodes); + /* Reset anything that shouldn't leak from one inode to the next. 
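The split being documented here matters because a slab constructor runs once per backing object, not once per allocation: locks and list heads set up in afs_i_init_once() survive reuse, while anything stateful must be cleared again in afs_alloc_inode(). A toy one-object "slab" showing which initialisation belongs where (all names invented):

#include <stdio.h>
#include <string.h>

struct vnode_sim {
        int lock_ready;         /* ctor-owned: survives object reuse */
        unsigned int flags;     /* per-lifetime: reset on every alloc */
};

static struct vnode_sim slab_obj;       /* a one-object "slab" */
static int constructed;

static struct vnode_sim *alloc_vnode_sim(void)
{
        struct vnode_sim *v = &slab_obj;

        if (!constructed) {             /* the afs_i_init_once() analogue */
                memset(v, 0, sizeof(*v));
                v->lock_ready = 1;
                constructed = 1;
        }
        v->flags = 0;                   /* the afs_alloc_inode() analogue */
        return v;
}

int main(void)
{
        struct vnode_sim *v = alloc_vnode_sim();

        v->flags = 0x13;                /* dirty the object, then "free" it */
        v = alloc_vnode_sim();          /* reused from the slab */
        printf("lock_ready=%d flags=%#x\n", v->lock_ready, v->flags);
        return 0;
}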
*/ memset(&vnode->fid, 0, sizeof(vnode->fid)); memset(&vnode->status, 0, sizeof(vnode->status)); vnode->volume = NULL; - vnode->update_cnt = 0; + vnode->lock_key = NULL; + vnode->permit_cache = NULL; + vnode->cb_interest = NULL; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = NULL; +#endif + vnode->flags = 1 << AFS_VNODE_UNSET; - vnode->cb_promised = false; + vnode->cb_type = 0; + vnode->lock_state = AFS_VNODE_LOCK_NONE; _leave(" = %p", &vnode->vfs_inode); return &vnode->vfs_inode; @@ -562,7 +608,7 @@ static void afs_destroy_inode(struct inode *inode) _debug("DESTROY INODE %p", inode); - ASSERTCMP(vnode->server, ==, NULL); + ASSERTCMP(vnode->cb_interest, ==, NULL); call_rcu(&inode->i_rcu, afs_i_callback); atomic_dec(&afs_count_active_inodes); @@ -573,6 +619,7 @@ static void afs_destroy_inode(struct inode *inode) */ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct afs_fs_cursor fc; struct afs_volume_status vs; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; @@ -582,21 +629,32 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) if (IS_ERR(key)) return PTR_ERR(key); - ret = afs_vnode_get_volume_status(vnode, key, &vs); - key_put(key); - if (ret < 0) { - _leave(" = %d", ret); - return ret; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + fc.flags |= AFS_FS_CURSOR_NO_VSLEEP; + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_get_volume_status(&fc, &vs); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); } - buf->f_type = dentry->d_sb->s_magic; - buf->f_bsize = AFS_BLOCK_SIZE; - buf->f_namelen = AFSNAMEMAX - 1; + key_put(key); - if (vs.max_quota == 0) - buf->f_blocks = vs.part_max_blocks; - else - buf->f_blocks = vs.max_quota; - buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use; - return 0; + if (ret == 0) { + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = AFS_BLOCK_SIZE; + buf->f_namelen = AFSNAMEMAX - 1; + + if (vs.max_quota == 0) + buf->f_blocks = vs.part_max_blocks; + else + buf->f_blocks = vs.max_quota; + buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use; + } + + return ret; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index a5e4cc561b6c..e372f89fd36a 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -12,58 +12,19 @@ #include <linux/gfp.h> #include <linux/init.h> #include <linux/sched.h> +#include "afs_fs.h" #include "internal.h" /* - * map volume locator abort codes to error codes + * Deliver reply data to a VL.GetEntryByNameU call. 
*/ -static int afs_vl_abort_to_error(u32 abort_code) +static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) { - _enter("%u", abort_code); - - switch (abort_code) { - case AFSVL_IDEXIST: return -EEXIST; - case AFSVL_IO: return -EREMOTEIO; - case AFSVL_NAMEEXIST: return -EEXIST; - case AFSVL_CREATEFAIL: return -EREMOTEIO; - case AFSVL_NOENT: return -ENOMEDIUM; - case AFSVL_EMPTY: return -ENOMEDIUM; - case AFSVL_ENTDELETED: return -ENOMEDIUM; - case AFSVL_BADNAME: return -EINVAL; - case AFSVL_BADINDEX: return -EINVAL; - case AFSVL_BADVOLTYPE: return -EINVAL; - case AFSVL_BADSERVER: return -EINVAL; - case AFSVL_BADPARTITION: return -EINVAL; - case AFSVL_REPSFULL: return -EFBIG; - case AFSVL_NOREPSERVER: return -ENOENT; - case AFSVL_DUPREPSERVER: return -EEXIST; - case AFSVL_RWNOTFOUND: return -ENOENT; - case AFSVL_BADREFCOUNT: return -EINVAL; - case AFSVL_SIZEEXCEEDED: return -EINVAL; - case AFSVL_BADENTRY: return -EINVAL; - case AFSVL_BADVOLIDBUMP: return -EINVAL; - case AFSVL_IDALREADYHASHED: return -EINVAL; - case AFSVL_ENTRYLOCKED: return -EBUSY; - case AFSVL_BADVOLOPER: return -EBADRQC; - case AFSVL_BADRELLOCKTYPE: return -EINVAL; - case AFSVL_RERELEASE: return -EREMOTEIO; - case AFSVL_BADSERVERFLAG: return -EINVAL; - case AFSVL_PERM: return -EACCES; - case AFSVL_NOMEM: return -EREMOTEIO; - default: - return afs_abort_to_error(abort_code); - } -} - -/* - * deliver reply data to a VL.GetEntryByXXX call - */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) -{ - struct afs_cache_vlocation *entry; - __be32 *bp; + struct afs_uvldbentry__xdr *uvldb; + struct afs_vldb_entry *entry; + bool new_only = false; u32 tmp; - int loop, ret; + int i, ret; _enter(""); @@ -72,144 +33,613 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) return ret; /* unmarshall the reply once we've received all of it */ - entry = call->reply; - bp = call->buffer; - - for (loop = 0; loop < 64; loop++) - entry->name[loop] = ntohl(*bp++); - entry->name[loop] = 0; - bp++; /* final NUL */ + uvldb = call->buffer; + entry = call->reply[0]; - bp++; /* type */ - entry->nservers = ntohl(*bp++); + for (i = 0; i < ARRAY_SIZE(uvldb->name) - 1; i++) + entry->name[i] = (u8)ntohl(uvldb->name[i]); + entry->name[i] = 0; + entry->name_len = strlen(entry->name); - for (loop = 0; loop < 8; loop++) - entry->servers[loop].s_addr = *bp++; + /* If there is a new replication site that we can use, ignore all the + * sites that aren't marked as new. 
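The replication-site comment above describes a two-pass scan: first decide whether any usable site is flagged as a new replication site, then, if so, keep only those sites. A standalone sketch of that filter; the flag names and values here are placeholders, not the real AFS_VLSF_* constants:

	/* Sketch of the two-pass site selection (placeholder flag values). */
	#include <stdio.h>

	#define TOY_VLSF_NEWREPSITE 0x01	/* placeholder value */
	#define TOY_VLSF_DONTUSE    0x02	/* placeholder value */

	int main(void)
	{
		unsigned flags[] = { 0, TOY_VLSF_DONTUSE, TOY_VLSF_NEWREPSITE, 0 };
		int i, new_only = 0;

		/* Pass 1: is any usable site marked as new? */
		for (i = 0; i < 4; i++)
			if (!(flags[i] & TOY_VLSF_DONTUSE) &&
			    (flags[i] & TOY_VLSF_NEWREPSITE))
				new_only = 1;

		/* Pass 2: take only new sites if any exist, else all usable ones. */
		for (i = 0; i < 4; i++) {
			if (flags[i] & TOY_VLSF_DONTUSE)
				continue;
			if (new_only && !(flags[i] & TOY_VLSF_NEWREPSITE))
				continue;
			printf("using site %d\n", i);	/* prints site 2 only */
		}
		return 0;
	}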
+ */ + for (i = 0; i < AFS_NMAXNSERVERS; i++) { + tmp = ntohl(uvldb->serverFlags[i]); + if (!(tmp & AFS_VLSF_DONTUSE) && + (tmp & AFS_VLSF_NEWREPSITE)) + new_only = true; + } - bp += 8; /* partition IDs */ + for (i = 0; i < AFS_NMAXNSERVERS; i++) { + struct afs_uuid__xdr *xdr; + struct afs_uuid *uuid; + int j; - for (loop = 0; loop < 8; loop++) { - tmp = ntohl(*bp++); - entry->srvtmask[loop] = 0; + tmp = ntohl(uvldb->serverFlags[i]); + if (tmp & AFS_VLSF_DONTUSE || + (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) + continue; if (tmp & AFS_VLSF_RWVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_RW; + entry->fs_mask[i] |= AFS_VOL_VTM_RW; if (tmp & AFS_VLSF_ROVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_RO; + entry->fs_mask[i] |= AFS_VOL_VTM_RO; if (tmp & AFS_VLSF_BACKVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_BAK; - } + entry->fs_mask[i] |= AFS_VOL_VTM_BAK; + if (!entry->fs_mask[i]) + continue; - entry->vid[0] = ntohl(*bp++); - entry->vid[1] = ntohl(*bp++); - entry->vid[2] = ntohl(*bp++); + xdr = &uvldb->serverNumber[i]; + uuid = (struct afs_uuid *)&entry->fs_server[i]; + uuid->time_low = xdr->time_low; + uuid->time_mid = htons(ntohl(xdr->time_mid)); + uuid->time_hi_and_version = htons(ntohl(xdr->time_hi_and_version)); + uuid->clock_seq_hi_and_reserved = (u8)ntohl(xdr->clock_seq_hi_and_reserved); + uuid->clock_seq_low = (u8)ntohl(xdr->clock_seq_low); + for (j = 0; j < 6; j++) + uuid->node[j] = (u8)ntohl(xdr->node[j]); - bp++; /* clone ID */ + entry->nr_servers++; + } + + for (i = 0; i < AFS_MAXTYPES; i++) + entry->vid[i] = ntohl(uvldb->volumeId[i]); - tmp = ntohl(*bp++); /* flags */ - entry->vidmask = 0; + tmp = ntohl(uvldb->flags); if (tmp & AFS_VLF_RWEXISTS) - entry->vidmask |= AFS_VOL_VTM_RW; + __set_bit(AFS_VLDB_HAS_RW, &entry->flags); if (tmp & AFS_VLF_ROEXISTS) - entry->vidmask |= AFS_VOL_VTM_RO; + __set_bit(AFS_VLDB_HAS_RO, &entry->flags); if (tmp & AFS_VLF_BACKEXISTS) - entry->vidmask |= AFS_VOL_VTM_BAK; - if (!entry->vidmask) - return -EBADMSG; + __set_bit(AFS_VLDB_HAS_BAK, &entry->flags); + if (!(tmp & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { + entry->error = -ENOMEDIUM; + __set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags); + } + + __set_bit(AFS_VLDB_QUERY_VALID, &entry->flags); _leave(" = 0 [done]"); return 0; } -/* - * VL.GetEntryByName operation type - */ -static const struct afs_call_type afs_RXVLGetEntryByName = { - .name = "VL.GetEntryByName", - .deliver = afs_deliver_vl_get_entry_by_xxx, - .abort_to_error = afs_vl_abort_to_error, - .destructor = afs_flat_call_destructor, -}; +static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call) +{ + kfree(call->reply[0]); + afs_flat_call_destructor(call); +} /* - * VL.GetEntryById operation type + * VL.GetEntryByNameU operation type. */ -static const struct afs_call_type afs_RXVLGetEntryById = { - .name = "VL.GetEntryById", - .deliver = afs_deliver_vl_get_entry_by_xxx, - .abort_to_error = afs_vl_abort_to_error, - .destructor = afs_flat_call_destructor, +static const struct afs_call_type afs_RXVLGetEntryByNameU = { + .name = "VL.GetEntryByNameU", + .op = afs_VL_GetEntryByNameU, + .deliver = afs_deliver_vl_get_entry_by_name_u, + .destructor = afs_destroy_vl_get_entry_by_name_u, }; /* - * dispatch a get volume entry by name operation + * Dispatch a get volume entry by name or ID operation (uuid variant). If the + * volname is a decimal number then it's a volume ID not a volume name. 
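The dispatcher below rounds the volume name up to a 4-byte XDR boundary with padsz = (4 - (volnamesz & 3)) & 3 before adding the two header words. A quick standalone check of that arithmetic, assuming the same 8-byte opcode-plus-length header used in the marshalling code:

	/* Sketch: verify the XDR padding arithmetic (user-space check only). */
	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		size_t volnamesz;

		for (volnamesz = 1; volnamesz <= 8; volnamesz++) {
			size_t padsz = (4 - (volnamesz & 3)) & 3;
			size_t reqsz = 8 + volnamesz + padsz; /* opcode + length + name */

			assert((volnamesz + padsz) % 4 == 0); /* name ends on a word */
			printf("name %zu -> pad %zu, request %zu bytes\n",
			       volnamesz, padsz, reqsz);
		}
		return 0;
	}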
*/ -int afs_vl_get_entry_by_name(struct in_addr *addr, - struct key *key, - const char *volname, - struct afs_cache_vlocation *entry, - bool async) +struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key, + const char *volname, + int volnamesz) { + struct afs_vldb_entry *entry; struct afs_call *call; - size_t volnamesz, reqsz, padsz; + size_t reqsz, padsz; __be32 *bp; _enter(""); - volnamesz = strlen(volname); padsz = (4 - (volnamesz & 3)) & 3; reqsz = 8 + volnamesz + padsz; - call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384); - if (!call) - return -ENOMEM; + entry = kzalloc(sizeof(struct afs_vldb_entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + call = afs_alloc_flat_call(net, &afs_RXVLGetEntryByNameU, reqsz, + sizeof(struct afs_uvldbentry__xdr)); + if (!call) { + kfree(entry); + return ERR_PTR(-ENOMEM); + } call->key = key; - call->reply = entry; - call->service_id = VL_SERVICE; - call->port = htons(AFS_VL_PORT); + call->reply[0] = entry; + call->ret_reply0 = true; - /* marshall the parameters */ + /* Marshall the parameters */ bp = call->request; - *bp++ = htonl(VLGETENTRYBYNAME); + *bp++ = htonl(VLGETENTRYBYNAMEU); *bp++ = htonl(volnamesz); memcpy(bp, volname, volnamesz); if (padsz > 0) - memset((void *) bp + volnamesz, 0, padsz); + memset((void *)bp + volnamesz, 0, padsz); - /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, async); + trace_afs_make_vl_call(call); + return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false); } /* - * dispatch a get volume entry by ID operation + * Deliver reply data to a VL.GetAddrsU call. + * + * GetAddrsU(IN ListAddrByAttributes *inaddr, + * OUT afsUUID *uuidp1, + * OUT uint32_t *uniquifier, + * OUT uint32_t *nentries, + * OUT bulkaddrs *blkaddrs); */ -int afs_vl_get_entry_by_id(struct in_addr *addr, - struct key *key, - afs_volid_t volid, - afs_voltype_t voltype, - struct afs_cache_vlocation *entry, - bool async) +static int afs_deliver_vl_get_addrs_u(struct afs_call *call) { + struct afs_addr_list *alist; + __be32 *bp; + u32 uniquifier, nentries, count; + int i, ret; + + _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the returned uuid, uniquifier, nentries and blkaddrs size */ + case 1: + ret = afs_extract_data(call, call->buffer, + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + bp = call->buffer + sizeof(struct afs_uuid__xdr); + uniquifier = ntohl(*bp++); + nentries = ntohl(*bp++); + count = ntohl(*bp); + + nentries = min(nentries, count); + alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; + call->reply[0] = alist; + call->count = count; + call->count2 = nentries; + call->offset = 0; + call->unmarshall++; + + /* Extract entries */ + case 2: + count = min(call->count, 4U); + ret = afs_extract_data(call, call->buffer, + count * sizeof(__be32), + call->count > 4); + if (ret < 0) + return ret; + + alist = call->reply[0]; + bp = call->buffer; + for (i = 0; i < count; i++) + if (alist->nr_addrs < call->count2) + afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); + + call->count -= count; + if (call->count > 0) + goto again; + call->offset = 0; + call->unmarshall++; + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +static void afs_vl_get_addrs_u_destructor(struct afs_call 
*call) +{ + afs_put_server(call->net, (struct afs_server *)call->reply[0]); + kfree(call->reply[1]); + return afs_flat_call_destructor(call); +} + +/* + * VL.GetAddrsU operation type. + */ +static const struct afs_call_type afs_RXVLGetAddrsU = { + .name = "VL.GetAddrsU", + .op = afs_VL_GetAddrsU, + .deliver = afs_deliver_vl_get_addrs_u, + .destructor = afs_vl_get_addrs_u_destructor, +}; + +/* + * Dispatch an operation to get the addresses for a server, where the server is + * nominated by UUID. + */ +struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key, + const uuid_t *uuid) +{ + struct afs_ListAddrByAttributes__xdr *r; + const struct afs_uuid *u = (const struct afs_uuid *)uuid; struct afs_call *call; __be32 *bp; + int i; _enter(""); - call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384); + call = afs_alloc_flat_call(net, &afs_RXVLGetAddrsU, + sizeof(__be32) + sizeof(struct afs_ListAddrByAttributes__xdr), + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32)); + if (!call) + return ERR_PTR(-ENOMEM); + + call->key = key; + call->reply[0] = NULL; + call->ret_reply0 = true; + + /* Marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETADDRSU); + r = (struct afs_ListAddrByAttributes__xdr *)bp; + r->Mask = htonl(AFS_VLADDR_UUID); + r->ipaddr = 0; + r->index = 0; + r->spare = 0; + r->uuid.time_low = u->time_low; + r->uuid.time_mid = htonl(ntohs(u->time_mid)); + r->uuid.time_hi_and_version = htonl(ntohs(u->time_hi_and_version)); + r->uuid.clock_seq_hi_and_reserved = htonl(u->clock_seq_hi_and_reserved); + r->uuid.clock_seq_low = htonl(u->clock_seq_low); + for (i = 0; i < 6; i++) + r->uuid.node[i] = ntohl(u->node[i]); + + trace_afs_make_vl_call(call); + return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); +} + +/* + * Deliver reply data to an VL.GetCapabilities operation. + */ +static int afs_deliver_vl_get_capabilities(struct afs_call *call) +{ + u32 count; + int ret; + + _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the capabilities word count */ + case 1: + ret = afs_extract_data(call, &call->tmp, + 1 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + count = ntohl(call->tmp); + + call->count = count; + call->count2 = count; + call->offset = 0; + call->unmarshall++; + + /* Extract capabilities words */ + case 2: + count = min(call->count, 16U); + ret = afs_extract_data(call, call->buffer, + count * sizeof(__be32), + call->count > 16); + if (ret < 0) + return ret; + + /* TODO: Examine capabilities */ + + call->count -= count; + if (call->count > 0) + goto again; + call->offset = 0; + call->unmarshall++; + break; + } + + call->reply[0] = (void *)(unsigned long)call->service_id; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * VL.GetCapabilities operation type + */ +static const struct afs_call_type afs_RXVLGetCapabilities = { + .name = "VL.GetCapabilities", + .op = afs_VL_GetCapabilities, + .deliver = afs_deliver_vl_get_capabilities, + .destructor = afs_flat_call_destructor, +}; + +/* + * Probe a fileserver for the capabilities that it supports. This can + * return up to 196 words. + * + * We use this to probe for service upgrade to determine what the server at the + * other end supports. 
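The deliver routines above all share one shape: a switch on call->unmarshall whose cases fall through, with the state number recording how far unmarshalling got so that delivery can resume when the next chunk of reply arrives. A cut-down user-space model of that shape; the toy_* names are hypothetical, and extract() stands in for the cross-segment buffering that afs_extract_data() does in the kernel:

	/* Sketch: resumable fall-through unmarshalling state machine. */
	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	struct toy_call {
		unsigned state;   /* resume point, like call->unmarshall */
		size_t   offset;  /* bytes gathered for the current item */
		uint8_t  buf[4];
		uint32_t count;   /* words still to consume */
	};

	/* Gather up to `want` bytes; 0 = item complete, 1 = need more data. */
	static int extract(struct toy_call *c, const uint8_t **p, size_t *n,
			   size_t want)
	{
		size_t copy = want - c->offset;
		if (copy > *n)
			copy = *n;
		memcpy(c->buf + c->offset, *p, copy);
		c->offset += copy; *p += copy; *n -= copy;
		if (c->offset < want)
			return 1;
		c->offset = 0;
		return 0;
	}

	/* Deliver one reply segment; may be called repeatedly. */
	static int deliver(struct toy_call *c, const uint8_t *p, size_t n)
	{
		uint32_t word;

		switch (c->state) {
		case 0:
			c->state = 1;
			/* fall through */
		case 1:	/* extract the word count */
			if (extract(c, &p, &n, 4))
				return 1;
			memcpy(&word, c->buf, 4);
			c->count = ntohl(word);
			c->state = 2;
			/* fall through */
		case 2:	/* extract the words one at a time */
			while (c->count > 0) {
				if (extract(c, &p, &n, 4))
					return 1;
				memcpy(&word, c->buf, 4);
				printf("capability word %u\n", (unsigned)ntohl(word));
				c->count--;
			}
		}
		return 0;	/* done */
	}

	int main(void)
	{
		/* Two words, 0x11 and 0x22, delivered in two fragments. */
		uint8_t reply[] = { 0,0,0,2, 0,0,0,0x11, 0,0,0,0x22 };
		struct toy_call c = { 0 };

		deliver(&c, reply, 5);	/* partial delivery: resumes later */
		deliver(&c, reply + 5, sizeof(reply) - 5);
		return 0;
	}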
+ */ +int afs_vl_get_capabilities(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXVLGetCapabilities, 1 * 4, 16 * 4); if (!call) return -ENOMEM; call->key = key; - call->reply = entry; - call->service_id = VL_SERVICE; - call->port = htons(AFS_VL_PORT); + call->upgrade = true; /* Let's see if this is a YFS server */ + call->reply[0] = (void *)VLGETCAPABILITIES; + call->ret_reply0 = true; /* marshall the parameters */ bp = call->request; - *bp++ = htonl(VLGETENTRYBYID); - *bp++ = htonl(volid); - *bp = htonl(voltype); + *bp++ = htonl(VLGETCAPABILITIES); + + /* Can't take a ref on server */ + trace_afs_make_vl_call(call); + return afs_make_call(ac, call, GFP_KERNEL, false); +} + +/* + * Deliver reply data to a YFSVL.GetEndpoints call. + * + * GetEndpoints(IN yfsServerAttributes *attr, + * OUT opr_uuid *uuid, + * OUT afs_int32 *uniquifier, + * OUT endpoints *fsEndpoints, + * OUT endpoints *volEndpoints) + */ +static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) +{ + struct afs_addr_list *alist; + __be32 *bp; + u32 uniquifier, size; + int ret; + + _enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall = 1; + + /* Extract the returned uuid, uniquifier, fsEndpoints count and + * either the first fsEndpoint type or the volEndpoints + * count if there are no fsEndpoints. */ + case 1: + ret = afs_extract_data(call, call->buffer, + sizeof(uuid_t) + + 3 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + bp = call->buffer + sizeof(uuid_t); + uniquifier = ntohl(*bp++); + call->count = ntohl(*bp++); + call->count2 = ntohl(*bp); /* Type or next count */ + + if (call->count > YFS_MAXENDPOINTS) + return -EBADMSG; + + alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; + call->reply[0] = alist; + call->offset = 0; + + if (call->count == 0) + goto extract_volendpoints; + + call->unmarshall = 2; + + /* Extract fsEndpoints[] entries */ + case 2: + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + size = sizeof(__be32) * (1 + 1 + 1); + break; + case YFS_ENDPOINT_IPV6: + size = sizeof(__be32) * (1 + 4 + 1); + break; + default: + return -EBADMSG; + } + + size += sizeof(__be32); + ret = afs_extract_data(call, call->buffer, size, true); + if (ret < 0) + return ret; + + alist = call->reply[0]; + bp = call->buffer; + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + if (ntohl(bp[0]) != sizeof(__be32) * 2) + return -EBADMSG; + afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); + bp += 3; + break; + case YFS_ENDPOINT_IPV6: + if (ntohl(bp[0]) != sizeof(__be32) * 5) + return -EBADMSG; + afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); + bp += 6; + break; + default: + return -EBADMSG; + } + + /* Got either the type of the next entry or the count of + * volEndpoints if no more fsEndpoints. + */ + call->count2 = htonl(*bp++); + + call->offset = 0; + call->count--; + if (call->count > 0) + goto again; + + extract_volendpoints: + /* Extract the list of volEndpoints. */ + call->count = call->count2; + if (!call->count) + goto end; + if (call->count > YFS_MAXENDPOINTS) + return -EBADMSG; + + call->unmarshall = 3; + + /* Extract the type of volEndpoints[0]. Normally we would + * extract the type of the next endpoint when we extract the + * data of the current one, but this is the first... 
+ */ + case 3: + ret = afs_extract_data(call, call->buffer, sizeof(__be32), true); + if (ret < 0) + return ret; + + bp = call->buffer; + call->count2 = htonl(*bp++); + call->offset = 0; + call->unmarshall = 4; + + /* Extract volEndpoints[] entries */ + case 4: + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + size = sizeof(__be32) * (1 + 1 + 1); + break; + case YFS_ENDPOINT_IPV6: + size = sizeof(__be32) * (1 + 4 + 1); + break; + default: + return -EBADMSG; + } + + if (call->count > 1) + size += sizeof(__be32); + ret = afs_extract_data(call, call->buffer, size, true); + if (ret < 0) + return ret; + + bp = call->buffer; + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + if (ntohl(bp[0]) != sizeof(__be32) * 2) + return -EBADMSG; + bp += 3; + break; + case YFS_ENDPOINT_IPV6: + if (ntohl(bp[0]) != sizeof(__be32) * 5) + return -EBADMSG; + bp += 6; + break; + default: + return -EBADMSG; + } + + /* Got either the type of the next entry or the count of + * volEndpoints if no more fsEndpoints. + */ + call->offset = 0; + call->count--; + if (call->count > 0) { + call->count2 = htonl(*bp++); + goto again; + } + + end: + call->unmarshall = 5; + + /* Done */ + case 5: + ret = afs_extract_data(call, call->buffer, 0, false); + if (ret < 0) + return ret; + call->unmarshall = 6; + + case 6: + break; + } + + alist = call->reply[0]; + + /* Start with IPv6 if available. */ + if (alist->nr_ipv4 < alist->nr_addrs) + alist->index = alist->nr_ipv4; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFSVL.GetEndpoints operation type. + */ +static const struct afs_call_type afs_YFSVLGetEndpoints = { + .name = "YFSVL.GetEndpoints", + .op = afs_YFSVL_GetEndpoints, + .deliver = afs_deliver_yfsvl_get_endpoints, + .destructor = afs_vl_get_addrs_u_destructor, +}; + +/* + * Dispatch an operation to get the addresses for a server, where the server is + * nominated by UUID. + */ +struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key, + const uuid_t *uuid) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_YFSVLGetEndpoints, + sizeof(__be32) * 2 + sizeof(*uuid), + sizeof(struct in6_addr) + sizeof(__be32) * 3); + if (!call) + return ERR_PTR(-ENOMEM); + + call->key = key; + call->reply[0] = NULL; + call->ret_reply0 = true; + + /* Marshall the parameters */ + bp = call->request; + *bp++ = htonl(YVLGETENDPOINTS); + *bp++ = htonl(YFS_SERVER_UUID); + memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ - /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, async); + trace_afs_make_vl_call(call); + return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c deleted file mode 100644 index 37b7c3b342a6..000000000000 --- a/fs/afs/vlocation.c +++ /dev/null @@ -1,720 +0,0 @@ -/* AFS volume location management - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
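In the YFSVL.GetEndpoints unmarshalling above, each endpoint record's wire size is derived from its type word: a length word, one or four address words, and a port word, plus one extra word whenever another entry's type (or the following list's count) rides along. A standalone check of those size formulas; the TOY_ENDPOINT_* values are illustrative stand-ins for YFS_ENDPOINT_IPV4/IPV6:

	/* Sketch: wire sizes of the type-tagged endpoint records. */
	#include <stdio.h>

	enum { TOY_ENDPOINT_IPV4, TOY_ENDPOINT_IPV6 };

	static size_t endpoint_size(int type, int more_follows)
	{
		size_t words;

		switch (type) {
		case TOY_ENDPOINT_IPV4: words = 1 + 1 + 1; break; /* len+addr+port */
		case TOY_ENDPOINT_IPV6: words = 1 + 4 + 1; break; /* len+addr+port */
		default: return 0;	/* -EBADMSG in the kernel code */
		}
		if (more_follows)
			words++;	/* next entry's type, or the next count */
		return words * 4;
	}

	int main(void)
	{
		printf("ipv4: %zu, ipv6: %zu bytes (mid-list)\n",
		       endpoint_size(TOY_ENDPOINT_IPV4, 1),
		       endpoint_size(TOY_ENDPOINT_IPV6, 1));
		printf("ipv4: %zu, ipv6: %zu bytes (last entry)\n",
		       endpoint_size(TOY_ENDPOINT_IPV4, 0),
		       endpoint_size(TOY_ENDPOINT_IPV6, 0));
		return 0;
	}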
- */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/sched.h> -#include "internal.h" - -static unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */ -static unsigned afs_vlocation_update_timeout = 10 * 60; - -static void afs_vlocation_reaper(struct work_struct *); -static void afs_vlocation_updater(struct work_struct *); - -static LIST_HEAD(afs_vlocation_updates); -static LIST_HEAD(afs_vlocation_graveyard); -static DEFINE_SPINLOCK(afs_vlocation_updates_lock); -static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock); -static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper); -static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater); -static struct workqueue_struct *afs_vlocation_update_worker; - -/* - * iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - */ -static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, - struct key *key, - struct afs_cache_vlocation *vldb) -{ - struct afs_cell *cell = vl->cell; - struct in_addr addr; - int count, ret; - - _enter("%s,%s", cell->name, vl->vldb.name); - - down_write(&vl->cell->vl_sem); - ret = -ENOMEDIUM; - for (count = cell->vl_naddrs; count > 0; count--) { - addr = cell->vl_addrs[cell->vl_curr_svix]; - - _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); - - /* attempt to access the VL server */ - ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb, - false); - switch (ret) { - case 0: - goto out; - case -ENOMEM: - case -ENONET: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - if (ret == -ENOMEM || ret == -ENONET) - goto out; - goto rotate; - case -ENOMEDIUM: - case -EKEYREJECTED: - case -EKEYEXPIRED: - goto out; - default: - ret = -EIO; - goto rotate; - } - - /* rotate the server records upon lookup failure */ - rotate: - cell->vl_curr_svix++; - cell->vl_curr_svix %= cell->vl_naddrs; - } - -out: - up_write(&vl->cell->vl_sem); - _leave(" = %d", ret); - return ret; -} - -/* - * iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - */ -static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, - struct key *key, - afs_volid_t volid, - afs_voltype_t voltype, - struct afs_cache_vlocation *vldb) -{ - struct afs_cell *cell = vl->cell; - struct in_addr addr; - int count, ret; - - _enter("%s,%x,%d,", cell->name, volid, voltype); - - down_write(&vl->cell->vl_sem); - ret = -ENOMEDIUM; - for (count = cell->vl_naddrs; count > 0; count--) { - addr = cell->vl_addrs[cell->vl_curr_svix]; - - _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); - - /* attempt to access the VL server */ - ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb, - false); - switch (ret) { - case 0: - goto out; - case -ENOMEM: - case -ENONET: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - if (ret == -ENOMEM || ret == -ENONET) - goto out; - goto rotate; - case -EBUSY: - vl->upd_busy_cnt++; - if (vl->upd_busy_cnt <= 3) { - if (vl->upd_busy_cnt > 1) { - /* second+ BUSY - sleep a little bit */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(1); - } - continue; - } - break; - case -ENOMEDIUM: - vl->upd_rej_cnt++; - goto rotate; - default: - ret = -EIO; - goto rotate; - } - - /* rotate the server records upon lookup failure */ - rotate: - cell->vl_curr_svix++; - cell->vl_curr_svix %= cell->vl_naddrs; - vl->upd_busy_cnt = 0; - } - -out: - if (ret 
< 0 && vl->upd_rej_cnt > 0) { - printk(KERN_NOTICE "kAFS:" - " Active volume no longer valid '%s'\n", - vl->vldb.name); - vl->valid = 0; - ret = -ENOMEDIUM; - } - - up_write(&vl->cell->vl_sem); - _leave(" = %d", ret); - return ret; -} - -/* - * allocate a volume location record - */ -static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell, - const char *name, - size_t namesz) -{ - struct afs_vlocation *vl; - - vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL); - if (vl) { - vl->cell = cell; - vl->state = AFS_VL_NEW; - atomic_set(&vl->usage, 1); - INIT_LIST_HEAD(&vl->link); - INIT_LIST_HEAD(&vl->grave); - INIT_LIST_HEAD(&vl->update); - init_waitqueue_head(&vl->waitq); - spin_lock_init(&vl->lock); - memcpy(vl->vldb.name, name, namesz); - } - - _leave(" = %p", vl); - return vl; -} - -/* - * update record if we found it in the cache - */ -static int afs_vlocation_update_record(struct afs_vlocation *vl, - struct key *key, - struct afs_cache_vlocation *vldb) -{ - afs_voltype_t voltype; - afs_volid_t vid; - int ret; - - /* try to look up a cached volume in the cell VL databases by ID */ - _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", - vl->vldb.name, - vl->vldb.vidmask, - ntohl(vl->vldb.servers[0].s_addr), - vl->vldb.srvtmask[0], - ntohl(vl->vldb.servers[1].s_addr), - vl->vldb.srvtmask[1], - ntohl(vl->vldb.servers[2].s_addr), - vl->vldb.srvtmask[2]); - - _debug("Vids: %08x %08x %08x", - vl->vldb.vid[0], - vl->vldb.vid[1], - vl->vldb.vid[2]); - - if (vl->vldb.vidmask & AFS_VOL_VTM_RW) { - vid = vl->vldb.vid[0]; - voltype = AFSVL_RWVOL; - } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) { - vid = vl->vldb.vid[1]; - voltype = AFSVL_ROVOL; - } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) { - vid = vl->vldb.vid[2]; - voltype = AFSVL_BACKVOL; - } else { - BUG(); - vid = 0; - voltype = 0; - } - - /* contact the server to make sure the volume is still available - * - TODO: need to handle disconnected operation here - */ - ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb); - switch (ret) { - /* net error */ - default: - printk(KERN_WARNING "kAFS:" - " failed to update volume '%s' (%x) up in '%s': %d\n", - vl->vldb.name, vid, vl->cell->name, ret); - _leave(" = %d", ret); - return ret; - - /* pulled from local cache into memory */ - case 0: - _leave(" = 0"); - return 0; - - /* uh oh... 
looks like the volume got deleted */ - case -ENOMEDIUM: - printk(KERN_ERR "kAFS:" - " volume '%s' (%x) does not exist '%s'\n", - vl->vldb.name, vid, vl->cell->name); - - /* TODO: make existing record unavailable */ - _leave(" = %d", ret); - return ret; - } -} - -/* - * apply the update to a VL record - */ -static void afs_vlocation_apply_update(struct afs_vlocation *vl, - struct afs_cache_vlocation *vldb) -{ - _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", - vldb->name, vldb->vidmask, - ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0], - ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1], - ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]); - - _debug("Vids: %08x %08x %08x", - vldb->vid[0], vldb->vid[1], vldb->vid[2]); - - if (strcmp(vldb->name, vl->vldb.name) != 0) - printk(KERN_NOTICE "kAFS:" - " name of volume '%s' changed to '%s' on server\n", - vl->vldb.name, vldb->name); - - vl->vldb = *vldb; - -#ifdef CONFIG_AFS_FSCACHE - fscache_update_cookie(vl->cache); -#endif -} - -/* - * fill in a volume location record, consulting the cache and the VL server - * both - */ -static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, - struct key *key) -{ - struct afs_cache_vlocation vldb; - int ret; - - _enter(""); - - ASSERTCMP(vl->valid, ==, 0); - - memset(&vldb, 0, sizeof(vldb)); - - /* see if we have an in-cache copy (will set vl->valid if there is) */ -#ifdef CONFIG_AFS_FSCACHE - vl->cache = fscache_acquire_cookie(vl->cell->cache, - &afs_vlocation_cache_index_def, vl, - true); -#endif - - if (vl->valid) { - /* try to update a known volume in the cell VL databases by - * ID as the name may have changed */ - _debug("found in cache"); - ret = afs_vlocation_update_record(vl, key, &vldb); - } else { - /* try to look up an unknown volume in the cell VL databases by - * name */ - ret = afs_vlocation_access_vl_by_name(vl, key, &vldb); - if (ret < 0) { - printk("kAFS: failed to locate '%s' in cell '%s'\n", - vl->vldb.name, vl->cell->name); - return ret; - } - } - - afs_vlocation_apply_update(vl, &vldb); - _leave(" = 0"); - return 0; -} - -/* - * queue a vlocation record for updates - */ -static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) -{ - struct afs_vlocation *xvl; - - /* wait at least 10 minutes before updating... */ - vl->update_at = ktime_get_real_seconds() + - afs_vlocation_update_timeout; - - spin_lock(&afs_vlocation_updates_lock); - - if (!list_empty(&afs_vlocation_updates)) { - /* ... 
but wait at least 1 second more than the newest record - * already queued so that we don't spam the VL server suddenly - * with lots of requests - */ - xvl = list_entry(afs_vlocation_updates.prev, - struct afs_vlocation, update); - if (vl->update_at <= xvl->update_at) - vl->update_at = xvl->update_at + 1; - } else { - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, - afs_vlocation_update_timeout * HZ); - } - - list_add_tail(&vl->update, &afs_vlocation_updates); - spin_unlock(&afs_vlocation_updates_lock); -} - -/* - * lookup volume location - * - iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - * - lookup in the local cache if not able to find on the VL server - * - insert/update in the local cache if did get a VL response - */ -struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell, - struct key *key, - const char *name, - size_t namesz) -{ - struct afs_vlocation *vl; - int ret; - - _enter("{%s},{%x},%*.*s,%zu", - cell->name, key_serial(key), - (int) namesz, (int) namesz, name, namesz); - - if (namesz >= sizeof(vl->vldb.name)) { - _leave(" = -ENAMETOOLONG"); - return ERR_PTR(-ENAMETOOLONG); - } - - /* see if we have an in-memory copy first */ - down_write(&cell->vl_sem); - spin_lock(&cell->vl_lock); - list_for_each_entry(vl, &cell->vl_list, link) { - if (vl->vldb.name[namesz] != '\0') - continue; - if (memcmp(vl->vldb.name, name, namesz) == 0) - goto found_in_memory; - } - spin_unlock(&cell->vl_lock); - - /* not in the cell's in-memory lists - create a new record */ - vl = afs_vlocation_alloc(cell, name, namesz); - if (!vl) { - up_write(&cell->vl_sem); - return ERR_PTR(-ENOMEM); - } - - afs_get_cell(cell); - - list_add_tail(&vl->link, &cell->vl_list); - vl->state = AFS_VL_CREATING; - up_write(&cell->vl_sem); - -fill_in_record: - ret = afs_vlocation_fill_in_record(vl, key); - if (ret < 0) - goto error_abandon; - spin_lock(&vl->lock); - vl->state = AFS_VL_VALID; - spin_unlock(&vl->lock); - wake_up(&vl->waitq); - - /* update volume entry in local cache */ -#ifdef CONFIG_AFS_FSCACHE - fscache_update_cookie(vl->cache); -#endif - - /* schedule for regular updates */ - afs_vlocation_queue_for_updates(vl); - goto success; - -found_in_memory: - /* found in memory */ - _debug("found in memory"); - atomic_inc(&vl->usage); - spin_unlock(&cell->vl_lock); - if (!list_empty(&vl->grave)) { - spin_lock(&afs_vlocation_graveyard_lock); - list_del_init(&vl->grave); - spin_unlock(&afs_vlocation_graveyard_lock); - } - up_write(&cell->vl_sem); - - /* see if it was an abandoned record that we might try filling in */ - spin_lock(&vl->lock); - while (vl->state != AFS_VL_VALID) { - afs_vlocation_state_t state = vl->state; - - _debug("invalid [state %d]", state); - - if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) { - vl->state = AFS_VL_CREATING; - spin_unlock(&vl->lock); - goto fill_in_record; - } - - /* must now wait for creation or update by someone else to - * complete */ - _debug("wait"); - - spin_unlock(&vl->lock); - ret = wait_event_interruptible(vl->waitq, - vl->state == AFS_VL_NEW || - vl->state == AFS_VL_VALID || - vl->state == AFS_VL_NO_VOLUME); - if (ret < 0) - goto error; - spin_lock(&vl->lock); - } - spin_unlock(&vl->lock); - -success: - _leave(" = %p", vl); - return vl; - -error_abandon: - spin_lock(&vl->lock); - vl->state = AFS_VL_NEW; - spin_unlock(&vl->lock); - wake_up(&vl->waitq); -error: - ASSERT(vl != NULL); - afs_put_vlocation(vl); - _leave(" = %d", ret); - return ERR_PTR(ret); -} - -/* - * 
finish using a volume location record - */ -void afs_put_vlocation(struct afs_vlocation *vl) -{ - if (!vl) - return; - - _enter("%s", vl->vldb.name); - - ASSERTCMP(atomic_read(&vl->usage), >, 0); - - if (likely(!atomic_dec_and_test(&vl->usage))) { - _leave(""); - return; - } - - spin_lock(&afs_vlocation_graveyard_lock); - if (atomic_read(&vl->usage) == 0) { - _debug("buried"); - list_move_tail(&vl->grave, &afs_vlocation_graveyard); - vl->time_of_death = ktime_get_real_seconds(); - queue_delayed_work(afs_wq, &afs_vlocation_reap, - afs_vlocation_timeout * HZ); - - /* suspend updates on this record */ - if (!list_empty(&vl->update)) { - spin_lock(&afs_vlocation_updates_lock); - list_del_init(&vl->update); - spin_unlock(&afs_vlocation_updates_lock); - } - } - spin_unlock(&afs_vlocation_graveyard_lock); - _leave(" [killed?]"); -} - -/* - * destroy a dead volume location record - */ -static void afs_vlocation_destroy(struct afs_vlocation *vl) -{ - _enter("%p", vl); - -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vl->cache, 0); -#endif - afs_put_cell(vl->cell); - kfree(vl); -} - -/* - * reap dead volume location records - */ -static void afs_vlocation_reaper(struct work_struct *work) -{ - LIST_HEAD(corpses); - struct afs_vlocation *vl; - unsigned long delay, expiry; - time64_t now; - - _enter(""); - - now = ktime_get_real_seconds(); - spin_lock(&afs_vlocation_graveyard_lock); - - while (!list_empty(&afs_vlocation_graveyard)) { - vl = list_entry(afs_vlocation_graveyard.next, - struct afs_vlocation, grave); - - _debug("check %p", vl); - - /* the queue is ordered most dead first */ - expiry = vl->time_of_death + afs_vlocation_timeout; - if (expiry > now) { - delay = (expiry - now) * HZ; - _debug("delay %lu", delay); - mod_delayed_work(afs_wq, &afs_vlocation_reap, delay); - break; - } - - spin_lock(&vl->cell->vl_lock); - if (atomic_read(&vl->usage) > 0) { - _debug("no reap"); - list_del_init(&vl->grave); - } else { - _debug("reap"); - list_move_tail(&vl->grave, &corpses); - list_del_init(&vl->link); - } - spin_unlock(&vl->cell->vl_lock); - } - - spin_unlock(&afs_vlocation_graveyard_lock); - - /* now reap the corpses we've extracted */ - while (!list_empty(&corpses)) { - vl = list_entry(corpses.next, struct afs_vlocation, grave); - list_del(&vl->grave); - afs_vlocation_destroy(vl); - } - - _leave(""); -} - -/* - * initialise the VL update process - */ -int __init afs_vlocation_update_init(void) -{ - afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated", - WQ_MEM_RECLAIM, 0); - return afs_vlocation_update_worker ? 
0 : -ENOMEM; -} - -/* - * discard all the volume location records for rmmod - */ -void afs_vlocation_purge(void) -{ - afs_vlocation_timeout = 0; - - spin_lock(&afs_vlocation_updates_lock); - list_del_init(&afs_vlocation_updates); - spin_unlock(&afs_vlocation_updates_lock); - mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0); - destroy_workqueue(afs_vlocation_update_worker); - - mod_delayed_work(afs_wq, &afs_vlocation_reap, 0); -} - -/* - * update a volume location - */ -static void afs_vlocation_updater(struct work_struct *work) -{ - struct afs_cache_vlocation vldb; - struct afs_vlocation *vl, *xvl; - time64_t now; - long timeout; - int ret; - - _enter(""); - - now = ktime_get_real_seconds(); - - /* find a record to update */ - spin_lock(&afs_vlocation_updates_lock); - for (;;) { - if (list_empty(&afs_vlocation_updates)) { - spin_unlock(&afs_vlocation_updates_lock); - _leave(" [nothing]"); - return; - } - - vl = list_entry(afs_vlocation_updates.next, - struct afs_vlocation, update); - if (atomic_read(&vl->usage) > 0) - break; - list_del_init(&vl->update); - } - - timeout = vl->update_at - now; - if (timeout > 0) { - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, timeout * HZ); - spin_unlock(&afs_vlocation_updates_lock); - _leave(" [nothing]"); - return; - } - - list_del_init(&vl->update); - atomic_inc(&vl->usage); - spin_unlock(&afs_vlocation_updates_lock); - - /* we can now perform the update */ - _debug("update %s", vl->vldb.name); - vl->state = AFS_VL_UPDATING; - vl->upd_rej_cnt = 0; - vl->upd_busy_cnt = 0; - - ret = afs_vlocation_update_record(vl, NULL, &vldb); - spin_lock(&vl->lock); - switch (ret) { - case 0: - afs_vlocation_apply_update(vl, &vldb); - vl->state = AFS_VL_VALID; - break; - case -ENOMEDIUM: - vl->state = AFS_VL_VOLUME_DELETED; - break; - default: - vl->state = AFS_VL_UNCERTAIN; - break; - } - spin_unlock(&vl->lock); - wake_up(&vl->waitq); - - /* and then reschedule */ - _debug("reschedule"); - vl->update_at = ktime_get_real_seconds() + - afs_vlocation_update_timeout; - - spin_lock(&afs_vlocation_updates_lock); - - if (!list_empty(&afs_vlocation_updates)) { - /* next update in 10 minutes, but wait at least 1 second more - * than the newest record already queued so that we don't spam - * the VL server suddenly with lots of requests - */ - xvl = list_entry(afs_vlocation_updates.prev, - struct afs_vlocation, update); - if (vl->update_at <= xvl->update_at) - vl->update_at = xvl->update_at + 1; - xvl = list_entry(afs_vlocation_updates.next, - struct afs_vlocation, update); - timeout = xvl->update_at - now; - if (timeout < 0) - timeout = 0; - } else { - timeout = afs_vlocation_update_timeout; - } - - ASSERT(list_empty(&vl->update)); - - list_add_tail(&vl->update, &afs_vlocation_updates); - - _debug("timeout %ld", timeout); - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, timeout * HZ); - spin_unlock(&afs_vlocation_updates_lock); - afs_put_vlocation(vl); -} diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c deleted file mode 100644 index dcb956143c86..000000000000 --- a/fs/afs/vnode.c +++ /dev/null @@ -1,1025 +0,0 @@ -/* AFS vnode management - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
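The updater being deleted above spaces records out in time: an update is scheduled no sooner than ten minutes after the last one, and at least one second after the newest entry already queued, so a burst of lookups cannot flood the VL server. The scheduling rule in isolation, as a user-space sketch assuming an absolute-seconds clock:

	/* Sketch of the deleted queue-spacing rule (arithmetic only). */
	#include <stdio.h>
	#include <time.h>

	#define TOY_UPDATE_TIMEOUT 600	/* ten minutes, as in the deleted code */

	/* Next update: now + 10min, pushed to newest_queued + 1s if later. */
	static time_t next_update_at(time_t now, time_t newest_queued)
	{
		time_t at = now + TOY_UPDATE_TIMEOUT;

		if (at <= newest_queued)
			at = newest_queued + 1;
		return at;
	}

	int main(void)
	{
		time_t now = 1000;

		/* Queue empty or old: plain ten-minute delay. */
		printf("%ld\n", (long)next_update_at(now, 0));    /* 1600 */
		/* Records already queued out to t=1650: one second apart. */
		printf("%ld\n", (long)next_update_at(now, 1650)); /* 1651 */
		return 0;
	}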
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include "internal.h" - -#if 0 -static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent, - int depth, char lr) -{ - struct afs_vnode *vnode; - bool bad = false; - - if (!node) - return false; - - if (node->rb_left) - bad = dump_tree_aux(node->rb_left, node, depth + 2, '/'); - - vnode = rb_entry(node, struct afs_vnode, cb_promise); - _debug("%c %*.*s%c%p {%d}", - rb_is_red(node) ? 'R' : 'B', - depth, depth, "", lr, - vnode, vnode->cb_expires_at); - if (rb_parent(node) != parent) { - printk("BAD: %p != %p\n", rb_parent(node), parent); - bad = true; - } - - if (node->rb_right) - bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\'); - - return bad; -} - -static noinline void dump_tree(const char *name, struct afs_server *server) -{ - _enter("%s", name); - if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-')) - BUG(); -} -#endif - -/* - * insert a vnode into the backing server's vnode tree - */ -static void afs_install_vnode(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *old_server = vnode->server; - struct afs_vnode *xvnode; - struct rb_node *parent, **p; - - _enter("%p,%p", vnode, server); - - if (old_server) { - spin_lock(&old_server->fs_lock); - rb_erase(&vnode->server_rb, &old_server->fs_vnodes); - spin_unlock(&old_server->fs_lock); - } - - afs_get_server(server); - vnode->server = server; - afs_put_server(old_server); - - /* insert into the server's vnode tree in FID order */ - spin_lock(&server->fs_lock); - - parent = NULL; - p = &server->fs_vnodes.rb_node; - while (*p) { - parent = *p; - xvnode = rb_entry(parent, struct afs_vnode, server_rb); - if (vnode->fid.vid < xvnode->fid.vid) - p = &(*p)->rb_left; - else if (vnode->fid.vid > xvnode->fid.vid) - p = &(*p)->rb_right; - else if (vnode->fid.vnode < xvnode->fid.vnode) - p = &(*p)->rb_left; - else if (vnode->fid.vnode > xvnode->fid.vnode) - p = &(*p)->rb_right; - else if (vnode->fid.unique < xvnode->fid.unique) - p = &(*p)->rb_left; - else if (vnode->fid.unique > xvnode->fid.unique) - p = &(*p)->rb_right; - else - BUG(); /* can't happen unless afs_iget() malfunctions */ - } - - rb_link_node(&vnode->server_rb, parent, p); - rb_insert_color(&vnode->server_rb, &server->fs_vnodes); - - spin_unlock(&server->fs_lock); - _leave(""); -} - -/* - * insert a vnode into the promising server's update/expiration tree - * - caller must hold vnode->lock - */ -static void afs_vnode_note_promise(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *old_server; - struct afs_vnode *xvnode; - struct rb_node *parent, **p; - - _enter("%p,%p", vnode, server); - - ASSERT(server != NULL); - - old_server = vnode->server; - if (vnode->cb_promised) { - if (server == old_server && - vnode->cb_expires == vnode->cb_expires_at) { - _leave(" [no change]"); - return; - } - - spin_lock(&old_server->cb_lock); - if (vnode->cb_promised) { - _debug("delete"); - rb_erase(&vnode->cb_promise, &old_server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&old_server->cb_lock); - } - - if (vnode->server != server) - 
afs_install_vnode(vnode, server); - - vnode->cb_expires_at = vnode->cb_expires; - _debug("PROMISE on %p {%lu}", - vnode, (unsigned long) vnode->cb_expires_at); - - /* abuse an RB-tree to hold the expiration order (we may have multiple - * items with the same expiration time) */ - spin_lock(&server->cb_lock); - - parent = NULL; - p = &server->cb_promises.rb_node; - while (*p) { - parent = *p; - xvnode = rb_entry(parent, struct afs_vnode, cb_promise); - if (vnode->cb_expires_at < xvnode->cb_expires_at) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&vnode->cb_promise, parent, p); - rb_insert_color(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = true; - - spin_unlock(&server->cb_lock); - _leave(""); -} - -/* - * handle remote file deletion by discarding the callback promise - */ -static void afs_vnode_deleted_remotely(struct afs_vnode *vnode) -{ - struct afs_server *server; - - _enter("{%p}", vnode->server); - - set_bit(AFS_VNODE_DELETED, &vnode->flags); - - server = vnode->server; - if (server) { - if (vnode->cb_promised) { - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - rb_erase(&vnode->cb_promise, - &server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&server->cb_lock); - } - - spin_lock(&server->fs_lock); - rb_erase(&vnode->server_rb, &server->fs_vnodes); - spin_unlock(&server->fs_lock); - - vnode->server = NULL; - afs_put_server(server); - } else { - ASSERT(!vnode->cb_promised); - } - - _leave(""); -} - -/* - * finish off updating the recorded status of a file after a successful - * operation completion - * - starts callback expiry timer - * - adds to server's callback list - */ -void afs_vnode_finalise_status_update(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *oldserver = NULL; - - _enter("%p,%p", vnode, server); - - spin_lock(&vnode->lock); - clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - afs_vnode_note_promise(vnode, server); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - - wake_up_all(&vnode->update_waitq); - afs_put_server(oldserver); - _leave(""); -} - -/* - * finish off updating the recorded status of a file after an operation failed - */ -static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret) -{ - _enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret); - - spin_lock(&vnode->lock); - - clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - - if (ret == -ENOENT) { - /* the file was deleted on the server */ - _debug("got NOENT from server - marking file deleted"); - afs_vnode_deleted_remotely(vnode); - } - - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - - wake_up_all(&vnode->update_waitq); - _leave(""); -} - -/* - * fetch file status from the volume - * - don't issue a fetch if: - * - the changed bit is not set and there's a valid callback - * - there are any outstanding ops that will fetch the status - * - TODO implement local caching - */ -int afs_vnode_fetch_status(struct afs_vnode *vnode, - struct afs_vnode *auth_vnode, struct key *key) -{ - struct afs_server *server; - unsigned long acl_order; - int ret; - - DECLARE_WAITQUEUE(myself, current); - - _enter("%s,{%x:%u.%u}", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - vnode->cb_promised) { - _leave(" [unchanged]"); - return 0; - } - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _leave(" 
[deleted]"); - return -ENOENT; - } - - acl_order = 0; - if (auth_vnode) - acl_order = auth_vnode->acl_order; - - spin_lock(&vnode->lock); - - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - vnode->cb_promised) { - spin_unlock(&vnode->lock); - _leave(" [unchanged]"); - return 0; - } - - ASSERTCMP(vnode->update_cnt, >=, 0); - - if (vnode->update_cnt > 0) { - /* someone else started a fetch */ - _debug("wait on fetch %d", vnode->update_cnt); - - set_current_state(TASK_UNINTERRUPTIBLE); - ASSERT(myself.func != NULL); - add_wait_queue(&vnode->update_waitq, &myself); - - /* wait for the status to be updated */ - for (;;) { - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) - break; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - break; - - /* check to see if it got updated and invalidated all - * before we saw it */ - if (vnode->update_cnt == 0) { - remove_wait_queue(&vnode->update_waitq, - &myself); - set_current_state(TASK_RUNNING); - goto get_anyway; - } - - spin_unlock(&vnode->lock); - - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - - spin_lock(&vnode->lock); - } - - remove_wait_queue(&vnode->update_waitq, &myself); - spin_unlock(&vnode->lock); - set_current_state(TASK_RUNNING); - - return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? - -ENOENT : 0; - } - -get_anyway: - /* okay... we're going to have to initiate the op */ - vnode->update_cnt++; - - spin_unlock(&vnode->lock); - - /* merge AFS status fetches and clear outstanding callback on this - * vnode */ - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %p{%08x}", - server, ntohl(server->addr.s_addr)); - - ret = afs_fs_fetch_file_status(server, key, vnode, NULL, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - _debug("adjust"); - if (auth_vnode) - afs_cache_permit(vnode, key, acl_order); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - _debug("failed [%d]", ret); - afs_vnode_status_update_failed(vnode, ret); - } - - ASSERTCMP(vnode->update_cnt, >=, 0); - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * fetch file data from the volume - * - TODO implement caching - */ -int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, - struct afs_read *desc) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - /* merge in AFS status fetches and clear outstanding callback on this - * vnode */ - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_fetch_data(server, key, vnode, desc, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = 
%d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * make a file or a directory - */ -int afs_vnode_create(struct afs_vnode *vnode, struct key *key, - const char *name, umode_t mode, struct afs_fid *newfid, - struct afs_file_status *newstatus, - struct afs_callback *newcb, struct afs_server **_server) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're creating in */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_create(server, key, vnode, name, mode, newfid, - newstatus, newcb, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - *_server = server; - } else { - afs_vnode_status_update_failed(vnode, ret); - *_server = NULL; - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * remove a file or directory - */ -int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name, - bool isdir) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're removing from */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_remove(server, key, vnode, name, isdir, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * create a hard link - */ -int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, - struct key *key, const char *name) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s", - dvnode->volume->vlocation->vldb.name, - dvnode->fid.vid, - dvnode->fid.vnode, - dvnode->fid.unique, - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're removing from */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); 
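Each of these deleted operations repeats the same boilerplate: bump update_cnt, loop picking a fileserver and issuing the RPC until afs_volume_release_fileserver() ends the loop, then finalise or fail the status update; the rewrite centralises this in the afs_fs_cursor loop visible in afs_statfs earlier in this diff. A condensed user-space model of the old loop, with hypothetical toy helpers standing in for the deleted functions:

	/* Sketch: the deleted pick/call/release retry loop in miniature. */
	#include <stdio.h>

	static int toy_pick_server(int attempt) { return attempt; /* rotate */ }
	static int toy_issue_rpc(int server)    { return server < 2 ? -1 : 0; }

	/* "Release" decides whether to stop: success or running out of
	 * servers ends the loop; retryable errors rotate to the next one. */
	static int toy_release_server(int server, int ret, int nr_servers)
	{
		return ret == 0 || server + 1 >= nr_servers;
	}

	int main(void)
	{
		int server, ret, attempt = 0;

		do {
			server = toy_pick_server(attempt++);
			printf("trying server %d\n", server);
			ret = toy_issue_rpc(server);
		} while (!toy_release_server(server, ret, 4));

		printf(ret == 0 ? "finalise status update\n"
				: "status update failed\n");
		return 0;
	}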
- spin_lock(&dvnode->lock); - dvnode->update_cnt++; - spin_unlock(&dvnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(dvnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_link(server, key, dvnode, vnode, name, - false); - - } while (!afs_volume_release_fileserver(dvnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_vnode_finalise_status_update(dvnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - afs_vnode_status_update_failed(dvnode, ret); - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - spin_lock(&dvnode->lock); - dvnode->update_cnt--; - ASSERTCMP(dvnode->update_cnt, >=, 0); - spin_unlock(&dvnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * create a symbolic link - */ -int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key, - const char *name, const char *content, - struct afs_fid *newfid, - struct afs_file_status *newstatus, - struct afs_server **_server) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s,%s,,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name, content); - - /* this op will fetch the status on the directory we're creating in */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_symlink(server, key, vnode, name, content, - newfid, newstatus, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - *_server = server; - } else { - afs_vnode_status_update_failed(vnode, ret); - *_server = NULL; - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * rename a file - */ -int afs_vnode_rename(struct afs_vnode *orig_dvnode, - struct afs_vnode *new_dvnode, - struct key *key, - const char *orig_name, - const char *new_name) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s", - orig_dvnode->volume->vlocation->vldb.name, - orig_dvnode->fid.vid, - orig_dvnode->fid.vnode, - orig_dvnode->fid.unique, - new_dvnode->volume->vlocation->vldb.name, - new_dvnode->fid.vid, - new_dvnode->fid.vnode, - new_dvnode->fid.unique, - key_serial(key), - orig_name, - new_name); - - /* this op will fetch the status on both the directories we're dealing - * with */ - spin_lock(&orig_dvnode->lock); - orig_dvnode->update_cnt++; - spin_unlock(&orig_dvnode->lock); - if (new_dvnode != orig_dvnode) { - spin_lock(&new_dvnode->lock); - new_dvnode->update_cnt++; - spin_unlock(&new_dvnode->lock); - } - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(orig_dvnode); - if (IS_ERR(server)) - goto 
no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_rename(server, key, orig_dvnode, orig_name, - new_dvnode, new_name, false); - - } while (!afs_volume_release_fileserver(orig_dvnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(orig_dvnode, server); - if (new_dvnode != orig_dvnode) - afs_vnode_finalise_status_update(new_dvnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(orig_dvnode, ret); - if (new_dvnode != orig_dvnode) - afs_vnode_status_update_failed(new_dvnode, ret); - } - - _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt); - return ret; - -no_server: - spin_lock(&orig_dvnode->lock); - orig_dvnode->update_cnt--; - ASSERTCMP(orig_dvnode->update_cnt, >=, 0); - spin_unlock(&orig_dvnode->lock); - if (new_dvnode != orig_dvnode) { - spin_lock(&new_dvnode->lock); - new_dvnode->update_cnt--; - ASSERTCMP(new_dvnode->update_cnt, >=, 0); - spin_unlock(&new_dvnode->lock); - } - _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt); - return PTR_ERR(server); -} - -/* - * write to a file - */ -int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last, - unsigned offset, unsigned to) -{ - struct afs_server *server; - struct afs_vnode *vnode = wb->vnode; - int ret; - - _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(wb->key), - first, last, offset, to); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_store_data(server, wb, first, last, offset, to, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * set the attributes on a file - */ -int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key, - struct iattr *attr) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_setattr(server, key, vnode, attr, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * get the status of a volume - */ -int 
afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, - struct afs_volume_status *vs) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_get_volume_status(server, key, vnode, vs, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * get a lock on a file - */ -int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, - afs_lock_type_t type) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%u", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), type); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_set_lock(server, key, vnode, type, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * extend a lock on a file - */ -int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_extend_lock(server, key, vnode, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * release a lock on a file - */ -int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_release_lock(server, key, vnode, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} diff --git a/fs/afs/volume.c b/fs/afs/volume.c index db73d6dad02b..684c48293353 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -10,19 +10,167 @@ */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/slab.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/sched.h> #include "internal.h" -static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; +unsigned __read_mostly 
afs_volume_gc_delay = 10; +unsigned __read_mostly afs_volume_record_life = 60 * 60; + +static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" }; + +/* + * Allocate a volume record and load it up from a vldb record. + */ +static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params, + struct afs_vldb_entry *vldb, + unsigned long type_mask) +{ + struct afs_server_list *slist; + struct afs_server *server; + struct afs_volume *volume; + int ret = -ENOMEM, nr_servers = 0, i, j; + + for (i = 0; i < vldb->nr_servers; i++) + if (vldb->fs_mask[i] & type_mask) + nr_servers++; + + volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); + if (!volume) + goto error_0; + + volume->vid = vldb->vid[params->type]; + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + volume->cell = afs_get_cell(params->cell); + volume->type = params->type; + volume->type_force = params->force; + volume->name_len = vldb->name_len; + + atomic_set(&volume->usage, 1); + INIT_LIST_HEAD(&volume->proc_link); + rwlock_init(&volume->servers_lock); + memcpy(volume->name, vldb->name, vldb->name_len + 1); + + slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask); + if (IS_ERR(slist)) { + ret = PTR_ERR(slist); + goto error_1; + } + + refcount_set(&slist->usage, 1); + volume->servers = slist; + + /* Make sure a records exists for each server this volume occupies. */ + for (i = 0; i < nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + + server = afs_lookup_server(params->cell, params->key, + &vldb->fs_server[i]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + if (ret == -ENOENT) + continue; + goto error_2; + } + + /* Insertion-sort by server pointer */ + for (j = 0; j < slist->nr_servers; j++) + if (slist->servers[j].server >= server) + break; + if (j < slist->nr_servers) { + if (slist->servers[j].server == server) { + afs_put_server(params->net, server); + continue; + } + + memmove(slist->servers + j + 1, + slist->servers + j, + (slist->nr_servers - j) * sizeof(struct afs_server_entry)); + } + + slist->servers[j].server = server; + slist->nr_servers++; + } + + if (slist->nr_servers == 0) { + ret = -EDESTADDRREQ; + goto error_2; + } + + return volume; + +error_2: + afs_put_serverlist(params->net, slist); +error_1: + kfree(volume); +error_0: + return ERR_PTR(ret); +} /* - * lookup a volume by name - * - this can be one of the following: + * Look up a VLDB record for a volume. 
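/* [Editor's note] The insertion loop above keeps slist->servers[] sorted by
 * raw afs_server pointer value and discards duplicates, which is what lets
 * afs_annotate_server_list() (used further down) compare an old and a new
 * list element by element. The same idiom in isolation, as standalone C
 * with illustrative types:
 */
#include <string.h>

struct entry { const void *key; };

/* Insert 'key' into the pointer-sorted array 'arr' of '*nr' entries,
 * skipping duplicates; the caller guarantees room for one more entry. */
static void insert_sorted(struct entry *arr, int *nr, const void *key)
{
	int j;

	for (j = 0; j < *nr; j++)
		if (arr[j].key >= key)
			break;
	if (j < *nr) {
		if (arr[j].key == key)
			return;
		memmove(arr + j + 1, arr + j, (*nr - j) * sizeof(*arr));
	}
	arr[j].key = key;
	(*nr)++;
}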
+ */ +static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, + struct key *key, + const char *volname, + size_t volnamesz) +{ + struct afs_addr_cursor ac; + struct afs_vldb_entry *vldb; + int ret; + + ret = afs_set_vl_cursor(&ac, cell); + if (ret < 0) + return ERR_PTR(ret); + + while (afs_iterate_addresses(&ac)) { + if (!test_bit(ac.index, &ac.alist->probed)) { + ret = afs_vl_get_capabilities(cell->net, &ac, key); + switch (ret) { + case VL_SERVICE: + clear_bit(ac.index, &ac.alist->yfs); + set_bit(ac.index, &ac.alist->probed); + ac.addr->srx_service = ret; + break; + case YFS_VL_SERVICE: + set_bit(ac.index, &ac.alist->yfs); + set_bit(ac.index, &ac.alist->probed); + ac.addr->srx_service = ret; + break; + } + } + + vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key, + volname, volnamesz); + switch (ac.error) { + case 0: + afs_end_cursor(&ac); + return vldb; + case -ECONNABORTED: + ac.error = afs_abort_to_error(ac.abort_code); + goto error; + case -ENOMEM: + case -ENONET: + goto error; + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + break; + default: + ac.error = -EIO; + goto error; + } + } + +error: + return ERR_PTR(afs_end_cursor(&ac)); +} + +/* + * Look up a volume in the VL server and create a candidate volume record for + * it. + * + * The volume name can be one of the following: * "%[cell:]volume[.]" R/W volume * "#[cell:]volume[.]" R/O or R/W volume (rwparent=0), * or R/W (rwparent=1) volume @@ -42,353 +190,218 @@ static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; * - Rule 3: If parent volume is R/W, then only mount R/W volume unless * explicitly told otherwise */ -struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) +struct afs_volume *afs_create_volume(struct afs_mount_params *params) { - struct afs_vlocation *vlocation = NULL; - struct afs_volume *volume = NULL; - struct afs_server *server = NULL; - char srvtmask; - int ret, loop; - - _enter("{%*.*s,%d}", - params->volnamesz, params->volnamesz, params->volname, params->rwpath); - - /* lookup the volume location record */ - vlocation = afs_vlocation_lookup(params->cell, params->key, - params->volname, params->volnamesz); - if (IS_ERR(vlocation)) { - ret = PTR_ERR(vlocation); - vlocation = NULL; - goto error; - } + struct afs_vldb_entry *vldb; + struct afs_volume *volume; + unsigned long type_mask = 1UL << params->type; - /* make the final decision on the type we want */ - ret = -ENOMEDIUM; - if (params->force && !(vlocation->vldb.vidmask & (1 << params->type))) - goto error; + vldb = afs_vl_lookup_vldb(params->cell, params->key, + params->volname, params->volnamesz); + if (IS_ERR(vldb)) + return ERR_CAST(vldb); - srvtmask = 0; - for (loop = 0; loop < vlocation->vldb.nservers; loop++) - srvtmask |= vlocation->vldb.srvtmask[loop]; + if (test_bit(AFS_VLDB_QUERY_ERROR, &vldb->flags)) { + volume = ERR_PTR(vldb->error); + goto error; + } + /* Make the final decision on the type we want */ + volume = ERR_PTR(-ENOMEDIUM); if (params->force) { - if (!(srvtmask & (1 << params->type))) + if (!(vldb->flags & type_mask)) goto error; - } else if (srvtmask & AFS_VOL_VTM_RO) { + } else if (test_bit(AFS_VLDB_HAS_RO, &vldb->flags)) { params->type = AFSVL_ROVOL; - } else if (srvtmask & AFS_VOL_VTM_RW) { + } else if (test_bit(AFS_VLDB_HAS_RW, &vldb->flags)) { params->type = AFSVL_RWVOL; } else { goto error; } - down_write(¶ms->cell->vl_sem); + type_mask = 1UL << params->type; + volume = afs_alloc_volume(params, vldb, type_mask); - /* is the volume already active? 
*/ - if (vlocation->vols[params->type]) { - /* yes - re-use it */ - volume = vlocation->vols[params->type]; - afs_get_volume(volume); - goto success; - } +error: + kfree(vldb); + return volume; +} - /* create a new volume record */ - _debug("creating new volume record"); +/* + * Destroy a volume record + */ +static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) +{ + _enter("%p", volume); - ret = -ENOMEM; - volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); - if (!volume) - goto error_up; +#ifdef CONFIG_AFS_FSCACHE + ASSERTCMP(volume->cache, ==, NULL); +#endif - atomic_set(&volume->usage, 1); - volume->type = params->type; - volume->type_force = params->force; - volume->cell = params->cell; - volume->vid = vlocation->vldb.vid[params->type]; - - init_rwsem(&volume->server_sem); - - /* look up all the applicable server records */ - for (loop = 0; loop < 8; loop++) { - if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) { - server = afs_lookup_server( - volume->cell, &vlocation->vldb.servers[loop]); - if (IS_ERR(server)) { - ret = PTR_ERR(server); - goto error_discard; - } + afs_put_serverlist(net, volume->servers); + afs_put_cell(net, volume->cell); + kfree(volume); - volume->servers[volume->nservers] = server; - volume->nservers++; - } + _leave(" [destroyed]"); +} + +/* + * Drop a reference on a volume record. + */ +void afs_put_volume(struct afs_cell *cell, struct afs_volume *volume) +{ + if (volume) { + _enter("%s", volume->name); + + if (atomic_dec_and_test(&volume->usage)) + afs_destroy_volume(cell->net, volume); } +} - /* attach the cache and volume location */ +/* + * Activate a volume. + */ +void afs_activate_volume(struct afs_volume *volume) +{ #ifdef CONFIG_AFS_FSCACHE - volume->cache = fscache_acquire_cookie(vlocation->cache, + volume->cache = fscache_acquire_cookie(volume->cell->cache, &afs_volume_cache_index_def, volume, true); #endif - afs_get_vlocation(vlocation); - volume->vlocation = vlocation; - - vlocation->vols[volume->type] = volume; - -success: - _debug("kAFS selected %s volume %08x", - afs_voltypes[volume->type], volume->vid); - up_write(¶ms->cell->vl_sem); - afs_put_vlocation(vlocation); - _leave(" = %p", volume); - return volume; - - /* clean up */ -error_up: - up_write(¶ms->cell->vl_sem); -error: - afs_put_vlocation(vlocation); - _leave(" = %d", ret); - return ERR_PTR(ret); - -error_discard: - up_write(¶ms->cell->vl_sem); - - for (loop = volume->nservers - 1; loop >= 0; loop--) - afs_put_server(volume->servers[loop]); - kfree(volume); - goto error; + write_lock(&volume->cell->proc_lock); + list_add_tail(&volume->proc_link, &volume->cell->proc_volumes); + write_unlock(&volume->cell->proc_lock); } /* - * destroy a volume record + * Deactivate a volume. 
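/* [Editor's note] The mount-type decision in afs_create_volume() above
 * reduces to: a forced type must be present in the VLDB entry or the mount
 * fails with -ENOMEDIUM; otherwise a R/O instance is preferred with R/W as
 * the fallback. A sketch covering just the R/O and R/W cases (the real
 * code also handles backup volumes through the type bitmask):
 */
static int afs_pick_voltype_sketch(bool force, int wanted_type,
				   bool has_ro, bool has_rw)
{
	if (force)
		return (wanted_type == AFSVL_ROVOL ? has_ro : has_rw) ?
			wanted_type : -ENOMEDIUM;
	if (has_ro)
		return AFSVL_ROVOL;
	if (has_rw)
		return AFSVL_RWVOL;
	return -ENOMEDIUM;
}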
*/ -void afs_put_volume(struct afs_volume *volume) +void afs_deactivate_volume(struct afs_volume *volume) { - struct afs_vlocation *vlocation; - int loop; - - if (!volume) - return; - - _enter("%p", volume); + _enter("%s", volume->name); - ASSERTCMP(atomic_read(&volume->usage), >, 0); + write_lock(&volume->cell->proc_lock); + list_del_init(&volume->proc_link); + write_unlock(&volume->cell->proc_lock); - vlocation = volume->vlocation; - - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - down_write(&vlocation->cell->vl_sem); - - if (likely(!atomic_dec_and_test(&volume->usage))) { - up_write(&vlocation->cell->vl_sem); - _leave(""); - return; - } - - vlocation->vols[volume->type] = NULL; - - up_write(&vlocation->cell->vl_sem); - - /* finish cleaning up the volume */ #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, 0); + fscache_relinquish_cookie(volume->cache, + test_bit(AFS_VOLUME_DELETED, &volume->flags)); + volume->cache = NULL; #endif - afs_put_vlocation(vlocation); - - for (loop = volume->nservers - 1; loop >= 0; loop--) - afs_put_server(volume->servers[loop]); - - kfree(volume); - _leave(" [destroyed]"); + _leave(""); } /* - * pick a server to use to try accessing this volume - * - returns with an elevated usage count on the server chosen + * Query the VL service to update the volume status. */ -struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode) +static int afs_update_volume_status(struct afs_volume *volume, struct key *key) { - struct afs_volume *volume = vnode->volume; - struct afs_server *server; - int ret, state, loop; + struct afs_server_list *new, *old, *discard; + struct afs_vldb_entry *vldb; + char idbuf[16]; + int ret, idsz; + + _enter(""); + + /* We look up an ID by passing it as a decimal string in the + * operation's name parameter. + */ + idsz = sprintf(idbuf, "%u", volume->vid); - _enter("%s", volume->vlocation->vldb.name); + vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); + if (IS_ERR(vldb)) { + ret = PTR_ERR(vldb); + goto error; + } - /* stick with the server we're already using if we can */ - if (vnode->server && vnode->server->fs_state == 0) { - afs_get_server(vnode->server); - _leave(" = %p [current]", vnode->server); - return vnode->server; + /* See if the volume got renamed. */ + if (vldb->name_len != volume->name_len || + memcmp(vldb->name, volume->name, vldb->name_len) != 0) { + /* TODO: Use RCU'd string. */ + memcpy(volume->name, vldb->name, AFS_MAXVOLNAME); + volume->name_len = vldb->name_len; } - down_read(&volume->server_sem); + /* See if the volume's server list got updated. */ + new = afs_alloc_server_list(volume->cell, key, + vldb, (1 << volume->type)); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto error_vldb; + } - /* handle the no-server case */ - if (volume->nservers == 0) { - ret = volume->rjservers ? 
-ENOMEDIUM : -ESTALE; - up_read(&volume->server_sem); - _leave(" = %d [no servers]", ret); - return ERR_PTR(ret); + write_lock(&volume->servers_lock); + + discard = new; + old = volume->servers; + if (afs_annotate_server_list(new, old)) { + new->seq = volume->servers_seq + 1; + volume->servers = new; + smp_wmb(); + volume->servers_seq++; + discard = old; } - /* basically, just search the list for the first live server and use - * that */ + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + clear_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + write_unlock(&volume->servers_lock); ret = 0; - for (loop = 0; loop < volume->nservers; loop++) { - server = volume->servers[loop]; - state = server->fs_state; - _debug("consider %d [%d]", loop, state); + afs_put_serverlist(volume->cell->net, discard); +error_vldb: + kfree(vldb); +error: + _leave(" = %d", ret); + return ret; +} - switch (state) { - /* found an apparently healthy server */ - case 0: - afs_get_server(server); - up_read(&volume->server_sem); - _leave(" = %p (picked %08x)", - server, ntohl(server->addr.s_addr)); - return server; +/* + * Make sure the volume record is up to date. + */ +int afs_check_volume_status(struct afs_volume *volume, struct key *key) +{ + time64_t now = ktime_get_real_seconds(); + int ret, retries = 0; - case -ENETUNREACH: - if (ret == 0) - ret = state; - break; + _enter(""); - case -EHOSTUNREACH: - if (ret == 0 || - ret == -ENETUNREACH) - ret = state; - break; + if (volume->update_at <= now) + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); - case -ECONNREFUSED: - if (ret == 0 || - ret == -ENETUNREACH || - ret == -EHOSTUNREACH) - ret = state; - break; +retry: + if (!test_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags) && + !test_bit(AFS_VOLUME_WAIT, &volume->flags)) { + _leave(" = 0"); + return 0; + } - default: - case -EREMOTEIO: - if (ret == 0 || - ret == -ENETUNREACH || - ret == -EHOSTUNREACH || - ret == -ECONNREFUSED) - ret = state; - break; - } + if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) { + ret = afs_update_volume_status(volume, key); + clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags); + clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags); + wake_up_bit(&volume->flags, AFS_VOLUME_WAIT); + _leave(" = %d", ret); + return ret; } - /* no available servers - * - TODO: handle the no active servers case better - */ - up_read(&volume->server_sem); - _leave(" = %d", ret); - return ERR_PTR(ret); -} + if (!test_bit(AFS_VOLUME_WAIT, &volume->flags)) { + _leave(" = 0 [no wait]"); + return 0; + } -/* - * release a server after use - * - releases the ref on the server struct that was acquired by picking - * - records result of using a particular server to access a volume - * - return 0 to try again, 1 if okay or to issue error - * - the caller must release the server struct if result was 0 - */ -int afs_volume_release_fileserver(struct afs_vnode *vnode, - struct afs_server *server, - int result) -{ - struct afs_volume *volume = vnode->volume; - unsigned loop; - - _enter("%s,%08x,%d", - volume->vlocation->vldb.name, ntohl(server->addr.s_addr), - result); - - switch (result) { - /* success */ - case 0: - server->fs_act_jif = jiffies; - server->fs_state = 0; - _leave(""); - return 1; - - /* the fileserver denied all knowledge of the volume */ - case -ENOMEDIUM: - server->fs_act_jif = jiffies; - down_write(&volume->server_sem); - - /* firstly, find where the server is in the active list (if it - * is) */ - for (loop = 0; loop < volume->nservers; loop++) - if (volume->servers[loop] 
== server) - goto present; - - /* no longer there - may have been discarded by another op */ - goto try_next_server_upw; - - present: - volume->nservers--; - memmove(&volume->servers[loop], - &volume->servers[loop + 1], - sizeof(volume->servers[loop]) * - (volume->nservers - loop)); - volume->servers[volume->nservers] = NULL; - afs_put_server(server); - volume->rjservers++; - - if (volume->nservers > 0) - /* another server might acknowledge its existence */ - goto try_next_server_upw; - - /* handle the case where all the fileservers have rejected the - * volume - * - TODO: try asking the fileservers for volume information - * - TODO: contact the VL server again to see if the volume is - * no longer registered - */ - up_write(&volume->server_sem); - afs_put_server(server); - _leave(" [completely rejected]"); - return 1; - - /* problem reaching the server */ - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - case -ETIME: - case -ETIMEDOUT: - case -EREMOTEIO: - /* mark the server as dead - * TODO: vary dead timeout depending on error - */ - spin_lock(&server->fs_lock); - if (!server->fs_state) { - server->fs_dead_jif = jiffies + HZ * 10; - server->fs_state = result; - printk("kAFS: SERVER DEAD state=%d\n", result); - } - spin_unlock(&server->fs_lock); - goto try_next_server; - - /* miscellaneous error */ - default: - server->fs_act_jif = jiffies; - case -ENOMEM: - case -ENONET: - /* tell the caller to accept the result */ - afs_put_server(server); - _leave(" [local failure]"); - return 1; + ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + _leave(" = %d", ret); + return ret; } - /* tell the caller to loop around and try the next server */ -try_next_server_upw: - up_write(&volume->server_sem); -try_next_server: - afs_put_server(server); - _leave(" [try next server]"); - return 0; + retries++; + if (retries == 4) { + _leave(" = -ESTALE"); + return -ESTALE; + } + goto retry; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 106e43db1115..cb5f8a3df577 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -8,6 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ + #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/fs.h> @@ -16,9 +17,6 @@ #include <linux/pagevec.h> #include "internal.h" -static int afs_write_back_from_locked_page(struct afs_writeback *wb, - struct page *page); - /* * mark a page as having been made dirty and thus needing writeback */ @@ -29,58 +27,6 @@ int afs_set_page_dirty(struct page *page) } /* - * unlink a writeback record because its usage has reached zero - * - must be called with the wb->vnode->writeback_lock held - */ -static void afs_unlink_writeback(struct afs_writeback *wb) -{ - struct afs_writeback *front; - struct afs_vnode *vnode = wb->vnode; - - list_del_init(&wb->link); - if (!list_empty(&vnode->writebacks)) { - /* if an fsync rises to the front of the queue then wake it - * up */ - front = list_entry(vnode->writebacks.next, - struct afs_writeback, link); - if (front->state == AFS_WBACK_SYNCING) { - _debug("wake up sync"); - front->state = AFS_WBACK_COMPLETE; - wake_up(&front->waitq); - } - } -} - -/* - * free a writeback record - */ -static void afs_free_writeback(struct afs_writeback *wb) -{ - _enter(""); - key_put(wb->key); - kfree(wb); -} - -/* - * dispose of a reference to a writeback record - */ -void afs_put_writeback(struct afs_writeback *wb) -{ - struct afs_vnode *vnode = wb->vnode; - - _enter("{%d}", wb->usage); - - spin_lock(&vnode->writeback_lock); - if (--wb->usage == 0) - afs_unlink_writeback(wb); - else - wb = NULL; - spin_unlock(&vnode->writeback_lock); - if (wb) - afs_free_writeback(wb); -} - -/* * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, @@ -103,7 +49,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, req->pages[0] = page; get_page(page); - ret = afs_vnode_fetch_data(vnode, key, req); + ret = afs_fetch_data(vnode, key, req); afs_put_read(req); if (ret < 0) { if (ret == -ENOENT) { @@ -125,42 +71,32 @@ int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct afs_writeback *candidate, *wb; struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; - struct key *key = file->private_data; - unsigned from = pos & (PAGE_SIZE - 1); - unsigned to = from + len; + struct key *key = afs_file_key(file); + unsigned long priv; + unsigned f, from = pos & (PAGE_SIZE - 1); + unsigned t, to = from + len; pgoff_t index = pos >> PAGE_SHIFT; int ret; _enter("{%x:%u},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, index, from, to); - candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); - if (!candidate) - return -ENOMEM; - candidate->vnode = vnode; - candidate->first = candidate->last = index; - candidate->offset_first = from; - candidate->to_last = to; - INIT_LIST_HEAD(&candidate->link); - candidate->usage = 1; - candidate->state = AFS_WBACK_PENDING; - init_waitqueue_head(&candidate->waitq); + /* We want to store information about how much of a page is altered in + * page->private. 
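/* [Editor's note] The encoding referred to here, and used by all of the new
 * write paths below, packs the dirty byte range [from, to) of a page into
 * the page->private word: 'from' in the low bits, 'to' shifted up by
 * AFS_PRIV_SHIFT (the AFS_PRIV_* constants live elsewhere in the AFS
 * sources). The pack/unpack helpers implied by the code:
 */
static unsigned long afs_pack_dirty_range(unsigned int from, unsigned int to)
{
	return ((unsigned long)to << AFS_PRIV_SHIFT) | from;
}

static void afs_unpack_dirty_range(unsigned long priv,
				   unsigned int *from, unsigned int *to)
{
	*from = priv & AFS_PRIV_MAX;
	*to = priv >> AFS_PRIV_SHIFT;
}
/* The BUILD_BUG_ON() just below guards exactly this scheme: with pages
 * larger than 32768 bytes, both halves must still fit in the word. */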
+ */ + BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8); page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - kfree(candidate); + if (!page) return -ENOMEM; - } if (!PageUptodate(page) && len != PAGE_SIZE) { ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); if (ret < 0) { unlock_page(page); put_page(page); - kfree(candidate); _leave(" = %d [prep]", ret); return ret; } @@ -171,79 +107,59 @@ int afs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; try_again: - spin_lock(&vnode->writeback_lock); - - /* see if this page is already pending a writeback under a suitable key - * - if so we can just join onto that one */ - wb = (struct afs_writeback *) page_private(page); - if (wb) { - if (wb->key == key && wb->state == AFS_WBACK_PENDING) - goto subsume_in_current_wb; - goto flush_conflicting_wb; + /* See if this page is already partially written in a way that we can + * merge the new write with. + */ + t = f = 0; + if (PagePrivate(page)) { + priv = page_private(page); + f = priv & AFS_PRIV_MAX; + t = priv >> AFS_PRIV_SHIFT; + ASSERTCMP(f, <=, t); } - if (index > 0) { - /* see if we can find an already pending writeback that we can - * append this page to */ - list_for_each_entry(wb, &vnode->writebacks, link) { - if (wb->last == index - 1 && wb->key == key && - wb->state == AFS_WBACK_PENDING) - goto append_to_previous_wb; + if (f != t) { + if (PageWriteback(page)) { + trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), + page->index, priv); + goto flush_conflicting_write; } + if (to < f || from > t) + goto flush_conflicting_write; + if (from < f) + f = from; + if (to > t) + t = to; + } else { + f = from; + t = to; } - list_add_tail(&candidate->link, &vnode->writebacks); - candidate->key = key_get(key); - spin_unlock(&vnode->writeback_lock); - SetPagePrivate(page); - set_page_private(page, (unsigned long) candidate); - _leave(" = 0 [new]"); - return 0; - -subsume_in_current_wb: - _debug("subsume"); - ASSERTRANGE(wb->first, <=, index, <=, wb->last); - if (index == wb->first && from < wb->offset_first) - wb->offset_first = from; - if (index == wb->last && to > wb->to_last) - wb->to_last = to; - spin_unlock(&vnode->writeback_lock); - kfree(candidate); - _leave(" = 0 [sub]"); - return 0; - -append_to_previous_wb: - _debug("append into %lx-%lx", wb->first, wb->last); - wb->usage++; - wb->last++; - wb->to_last = to; - spin_unlock(&vnode->writeback_lock); + priv = (unsigned long)t << AFS_PRIV_SHIFT; + priv |= f; + trace_afs_page_dirty(vnode, tracepoint_string("begin"), + page->index, priv); SetPagePrivate(page); - set_page_private(page, (unsigned long) wb); - kfree(candidate); - _leave(" = 0 [app]"); + set_page_private(page, priv); + _leave(" = 0"); return 0; - /* the page is currently bound to another context, so if it's dirty we - * need to flush it before we can use the new context */ -flush_conflicting_wb: + /* The previous write and this write aren't adjacent or overlapping, so + * flush the page out. 
+ */ +flush_conflicting_write: _debug("flush conflict"); - if (wb->state == AFS_WBACK_PENDING) - wb->state = AFS_WBACK_CONFLICTING; - spin_unlock(&vnode->writeback_lock); - if (clear_page_dirty_for_io(page)) { - ret = afs_write_back_from_locked_page(wb, page); - if (ret < 0) { - afs_put_writeback(candidate); - _leave(" = %d", ret); - return ret; - } + ret = write_one_page(page); + if (ret < 0) { + _leave(" = %d", ret); + return ret; } - /* the page holds a ref on the writeback record */ - afs_put_writeback(wb); - set_page_private(page, 0); - ClearPagePrivate(page); + ret = lock_page_killable(page); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } goto try_again; } @@ -255,7 +171,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct key *key = file->private_data; + struct key *key = afs_file_key(file); loff_t i_size, maybe_i_size; int ret; @@ -266,11 +182,11 @@ int afs_write_end(struct file *file, struct address_space *mapping, i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { - spin_lock(&vnode->writeback_lock); + spin_lock(&vnode->wb_lock); i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) i_size_write(&vnode->vfs_inode, maybe_i_size); - spin_unlock(&vnode->writeback_lock); + spin_unlock(&vnode->wb_lock); } if (!PageUptodate(page)) { @@ -299,16 +215,17 @@ int afs_write_end(struct file *file, struct address_space *mapping, /* * kill all the pages in the given range */ -static void afs_kill_pages(struct afs_vnode *vnode, bool error, +static void afs_kill_pages(struct address_space *mapping, pgoff_t first, pgoff_t last) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; unsigned count, loop; _enter("{%x:%u},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("kill %lx-%lx", first, last); @@ -316,37 +233,157 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, count = last - first + 1; if (count > PAGEVEC_SIZE) count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, - first, count, pv.pages); + pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { struct page *page = pv.pages[loop]; ClearPageUptodate(page); - if (error) - SetPageError(page); - if (PageWriteback(page)) - end_page_writeback(page); + SetPageError(page); + end_page_writeback(page); + if (page->index >= first) + first = page->index + 1; + lock_page(page); + generic_error_remove_page(mapping, page); + } + + __pagevec_release(&pv); + } while (first <= last); + + _leave(""); +} + +/* + * Redirty all the pages in a given range. 
+ */ +static void afs_redirty_pages(struct writeback_control *wbc, + struct address_space *mapping, + pgoff_t first, pgoff_t last) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct pagevec pv; + unsigned count, loop; + + _enter("{%x:%u},%lx-%lx", + vnode->fid.vid, vnode->fid.vnode, first, last); + + pagevec_init(&pv); + + do { + _debug("redirty %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); + ASSERTCMP(pv.nr, ==, count); + + for (loop = 0; loop < count; loop++) { + struct page *page = pv.pages[loop]; + + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); if (page->index >= first) first = page->index + 1; } __pagevec_release(&pv); - } while (first < last); + } while (first <= last); _leave(""); } /* - * synchronously write back the locked page and any subsequent non-locked dirty - * pages also covered by the same writeback record + * write to a file */ -static int afs_write_back_from_locked_page(struct afs_writeback *wb, - struct page *primary_page) +static int afs_store_data(struct address_space *mapping, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct afs_fs_cursor fc; + struct afs_wb_key *wbk = NULL; + struct list_head *p; + int ret = -ENOKEY, ret2; + + _enter("%s{%x:%u.%u},%lx,%lx,%x,%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + first, last, offset, to); + + spin_lock(&vnode->wb_lock); + p = vnode->wb_keys.next; + + /* Iterate through the list looking for a valid key to use. */ +try_next_key: + while (p != &vnode->wb_keys) { + wbk = list_entry(p, struct afs_wb_key, vnode_link); + _debug("wbk %u", key_serial(wbk->key)); + ret2 = key_validate(wbk->key); + if (ret2 == 0) + goto found_key; + if (ret == -ENOKEY) + ret = ret2; + p = p->next; + } + + spin_unlock(&vnode->wb_lock); + afs_put_wb_key(wbk); + _leave(" = %d [no keys]", ret); + return ret; + +found_key: + refcount_inc(&wbk->usage); + spin_unlock(&vnode->wb_lock); + + _debug("USE WB KEY %u", key_serial(wbk->key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_store_data(&fc, mapping, first, last, offset, to); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + switch (ret) { + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + _debug("next"); + spin_lock(&vnode->wb_lock); + p = wbk->vnode_link.next; + afs_put_wb_key(wbk); + goto try_next_key; + } + + afs_put_wb_key(wbk); + _leave(" = %d", ret); + return ret; +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. 
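/* [Editor's note] afs_store_data() above rotates through the writeback keys
 * cached on the vnode: the first key that key_validate() accepts is used
 * for the StoreData RPC, and a permission or key error moves the scan on to
 * the next entry. Ignoring the wb_lock juggling and refcounting of the real
 * code, the control flow is roughly (afs_try_store() is a hypothetical
 * wrapper for the RPC):
 */
ret = -ENOKEY;
list_for_each_entry(wbk, &vnode->wb_keys, vnode_link) {
	if (key_validate(wbk->key) != 0)
		continue;
	ret = afs_try_store(wbk);
	switch (ret) {
	case -EACCES: case -EPERM: case -ENOKEY:
	case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED:
		continue;	/* rejected: try the next key */
	}
	break;			/* success or hard error: stop */
}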
+ */ +static int afs_write_back_from_locked_page(struct address_space *mapping, + struct writeback_control *wbc, + struct page *primary_page, + pgoff_t final_page) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct page *pages[8], *page; - unsigned long count; - unsigned n, offset, to; + unsigned long count, priv; + unsigned n, offset, to, f, t; pgoff_t start, first, last; int loop, ret; @@ -356,20 +393,33 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, if (test_set_page_writeback(primary_page)) BUG(); - /* find all consecutive lockable dirty pages, stopping when we find a - * page that is not immediately lockable, is not dirty or is missing, - * or we reach the end of the range */ + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ start = primary_page->index; - if (start >= wb->last) + priv = page_private(primary_page); + offset = priv & AFS_PRIV_MAX; + to = priv >> AFS_PRIV_SHIFT; + trace_afs_page_dirty(vnode, tracepoint_string("store"), + primary_page->index, priv); + + WARN_ON(offset == to); + if (offset == to) + trace_afs_page_dirty(vnode, tracepoint_string("WARN"), + primary_page->index, priv); + + if (start >= final_page || to < PAGE_SIZE) goto no_more; + start++; do { _debug("more %lx [%lx]", start, count); - n = wb->last - start + 1; + n = final_page - start + 1; if (n > ARRAY_SIZE(pages)) n = ARRAY_SIZE(pages); - n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, - start, n, pages); + n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages); _debug("fgpc %u", n); if (n == 0) goto no_more; @@ -381,16 +431,30 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, } for (loop = 0; loop < n; loop++) { + if (to != PAGE_SIZE) + break; page = pages[loop]; - if (page->index > wb->last) + if (page->index > final_page) break; if (!trylock_page(page)) break; - if (!PageDirty(page) || - page_private(page) != (unsigned long) wb) { + if (!PageDirty(page) || PageWriteback(page)) { unlock_page(page); break; } + + priv = page_private(page); + f = priv & AFS_PRIV_MAX; + t = priv >> AFS_PRIV_SHIFT; + if (f != 0) { + unlock_page(page); + break; + } + to = t; + + trace_afs_page_dirty(vnode, tracepoint_string("store+"), + page->index, priv); + if (!clear_page_dirty_for_io(page)) BUG(); if (test_set_page_writeback(page)) @@ -406,50 +470,55 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, } start += loop; - } while (start <= wb->last && count < 65536); + } while (start <= final_page && count < 65536); no_more: - /* we now have a contiguous set of dirty pages, each with writeback set - * and the dirty mark cleared; the first page is locked and must remain - * so, all the rest are unlocked */ + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + unlock_page(primary_page); + first = primary_page->index; last = first + count - 1; - offset = (first == wb->first) ? wb->offset_first : 0; - to = (last == wb->last) ? wb->to_last : PAGE_SIZE; - _debug("write back %lx[%u..] 
to %lx[..%u]", first, offset, last, to); - ret = afs_vnode_store_data(wb, first, last, offset, to); - if (ret < 0) { - switch (ret) { - case -EDQUOT: - case -ENOSPC: - mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC); - break; - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - afs_kill_pages(wb->vnode, true, first, last); - mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO); - break; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - afs_kill_pages(wb->vnode, false, first, last); - break; - default: - break; - } - } else { + ret = afs_store_data(mapping, first, last, offset, to); + switch (ret) { + case 0: ret = count; + break; + + default: + pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); + /* Fall through */ + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + afs_redirty_pages(wbc, mapping, first, last); + mapping_set_error(mapping, ret); + break; + + case -EDQUOT: + case -ENOSPC: + afs_redirty_pages(wbc, mapping, first, last); + mapping_set_error(mapping, -ENOSPC); + break; + + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + afs_kill_pages(mapping, first, last); + mapping_set_error(mapping, ret); + break; } _leave(" = %d", ret); @@ -462,16 +531,12 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - struct afs_writeback *wb; int ret; _enter("{%lx},", page->index); - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - - ret = afs_write_back_from_locked_page(wb, page); - unlock_page(page); + ret = afs_write_back_from_locked_page(page->mapping, wbc, page, + wbc->range_end >> PAGE_SHIFT); if (ret < 0) { _leave(" = %d", ret); return 0; @@ -490,33 +555,30 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { - struct afs_writeback *wb; struct page *page; int ret, n; _enter(",,%lx,%lx,", index, end); do { - n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, - 1, &page); + n = find_get_pages_range_tag(mapping, &index, end, + PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; _debug("wback %lx", page->index); - if (page->index > end) { - *_next = index; - put_page(page); - _leave(" = 0 [%lx]", *_next); - return 0; - } - /* at this point we hold neither mapping->tree_lock nor lock on * the page itself: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled back from * swapper_space to tmpfs file mapping */ - lock_page(page); + ret = lock_page_killable(page); + if (ret < 0) { + put_page(page); + _leave(" = %d", ret); + return ret; + } if (page->mapping != mapping || !PageDirty(page)) { unlock_page(page); @@ -532,17 +594,9 @@ static int afs_writepages_region(struct address_space *mapping, continue; } - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - - spin_lock(&wb->vnode->writeback_lock); - wb->state = AFS_WBACK_WRITING; - spin_unlock(&wb->vnode->writeback_lock); - if (!clear_page_dirty_for_io(page)) BUG(); - ret = afs_write_back_from_locked_page(wb, page); - unlock_page(page); + ret = afs_write_back_from_locked_page(mapping, wbc, page, end); put_page(page); if (ret < 0) { _leave(" = %d", ret); @@ -598,18 +652,15 @@ int afs_writepages(struct address_space *mapping, */ void afs_pages_written_back(struct 
afs_vnode *vnode, struct afs_call *call) { - struct afs_writeback *wb = call->wb; struct pagevec pv; + unsigned long priv; unsigned count, loop; pgoff_t first = call->first, last = call->last; - bool free_wb; _enter("{%x:%u},{%lx-%lx}", vnode->fid.vid, vnode->fid.vnode, first, last); - ASSERT(wb != NULL); - - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("done %lx-%lx", first, last); @@ -617,35 +668,22 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) count = last - first + 1; if (count > PAGEVEC_SIZE) count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(call->mapping, first, count, - pv.pages); + pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, + first, count, pv.pages); ASSERTCMP(pv.nr, ==, count); - spin_lock(&vnode->writeback_lock); for (loop = 0; loop < count; loop++) { - struct page *page = pv.pages[loop]; - end_page_writeback(page); - if (page_private(page) == (unsigned long) wb) { - set_page_private(page, 0); - ClearPagePrivate(page); - wb->usage--; - } - } - free_wb = false; - if (wb->usage == 0) { - afs_unlink_writeback(wb); - free_wb = true; + priv = page_private(pv.pages[loop]); + trace_afs_page_dirty(vnode, tracepoint_string("clear"), + pv.pages[loop]->index, priv); + set_page_private(pv.pages[loop], 0); + end_page_writeback(pv.pages[loop]); } - spin_unlock(&vnode->writeback_lock); first += count; - if (free_wb) { - afs_free_writeback(wb); - wb = NULL; - } - __pagevec_release(&pv); } while (first <= last); + afs_prune_wb_keys(vnode); _leave(""); } @@ -677,28 +715,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) } /* - * flush the vnode to the fileserver - */ -int afs_writeback_all(struct afs_vnode *vnode) -{ - struct address_space *mapping = vnode->vfs_inode.i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_cyclic = 1, - }; - int ret; - - _enter(""); - - ret = mapping->a_ops->writepages(mapping, &wbc); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - - _leave(" = %d", ret); - return ret; -} - -/* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. 
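/* [Editor's note] afs_pages_written_back() above (like afs_kill_pages() and
 * afs_redirty_pages() earlier) uses the stock pagevec batching idiom for
 * walking a contiguous index range: take up to PAGEVEC_SIZE pages at a time
 * with find_get_pages_contig(), do the per-page work, release the batch and
 * advance. Skeleton, with the per-page work reduced to ending writeback:
 */
pagevec_init(&pv);
do {
	count = last - first + 1;
	if (count > PAGEVEC_SIZE)
		count = PAGEVEC_SIZE;
	pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
	for (loop = 0; loop < pv.nr; loop++)
		end_page_writeback(pv.pages[loop]);
	first += count;
	__pagevec_release(&pv);
} while (first <= last);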
@@ -706,61 +722,13 @@ int afs_writeback_all(struct afs_vnode *vnode) int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file_inode(file); - struct afs_writeback *wb, *xwb; struct afs_vnode *vnode = AFS_FS_I(inode); - int ret; _enter("{%x:%u},{n=%pD},%d", vnode->fid.vid, vnode->fid.vnode, file, datasync); - ret = file_write_and_wait_range(file, start, end); - if (ret) - return ret; - inode_lock(inode); - - /* use a writeback record as a marker in the queue - when this reaches - * the front of the queue, all the outstanding writes are either - * completed or rejected */ - wb = kzalloc(sizeof(*wb), GFP_KERNEL); - if (!wb) { - ret = -ENOMEM; - goto out; - } - wb->vnode = vnode; - wb->first = 0; - wb->last = -1; - wb->offset_first = 0; - wb->to_last = PAGE_SIZE; - wb->usage = 1; - wb->state = AFS_WBACK_SYNCING; - init_waitqueue_head(&wb->waitq); - - spin_lock(&vnode->writeback_lock); - list_for_each_entry(xwb, &vnode->writebacks, link) { - if (xwb->state == AFS_WBACK_PENDING) - xwb->state = AFS_WBACK_CONFLICTING; - } - list_add_tail(&wb->link, &vnode->writebacks); - spin_unlock(&vnode->writeback_lock); - - /* push all the outstanding writebacks to the server */ - ret = afs_writeback_all(vnode); - if (ret < 0) { - afs_put_writeback(wb); - _leave(" = %d [wb]", ret); - goto out; - } - - /* wait for the preceding writes to actually complete */ - ret = wait_event_interruptible(wb->waitq, - wb->state == AFS_WBACK_COMPLETE || - vnode->writebacks.next == &wb->link); - afs_put_writeback(wb); - _leave(" = %d", ret); -out: - inode_unlock(inode); - return ret; + return file_write_and_wait_range(file, start, end); } /* @@ -781,19 +749,114 @@ int afs_flush(struct file *file, fl_owner_t id) * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal */ -int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int afs_page_mkwrite(struct vm_fault *vmf) { - struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + struct afs_vnode *vnode = AFS_FS_I(inode); + unsigned long priv; _enter("{{%x:%u}},{%lx}", - vnode->fid.vid, vnode->fid.vnode, page->index); + vnode->fid.vid, vnode->fid.vnode, vmf->page->index); + + sb_start_pagefault(inode->i_sb); - /* wait for the page to be written to the cache before we allow it to - * be modified */ + /* Wait for the page to be written to the cache before we allow it to + * be modified. We then assume the entire page will need writing back. + */ #ifdef CONFIG_AFS_FSCACHE - fscache_wait_on_page_write(vnode->cache, page); + fscache_wait_on_page_write(vnode->cache, vmf->page); #endif - _leave(" = 0"); - return 0; + if (PageWriteback(vmf->page) && + wait_on_page_bit_killable(vmf->page, PG_writeback) < 0) + return VM_FAULT_RETRY; + + if (lock_page_killable(vmf->page) < 0) + return VM_FAULT_RETRY; + + /* We mustn't change page->private until writeback is complete as that + * details the portion of the page we need to write back and we might + * need to redirty the page if there's a problem. 
+ */ + wait_on_page_writeback(vmf->page); + + priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */ + priv |= 0; /* From */ + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), + vmf->page->index, priv); + SetPagePrivate(vmf->page); + set_page_private(vmf->page, priv); + + sb_end_pagefault(inode->i_sb); + return VM_FAULT_LOCKED; +} + +/* + * Prune the keys cached for writeback. The caller must hold vnode->wb_lock. + */ +void afs_prune_wb_keys(struct afs_vnode *vnode) +{ + LIST_HEAD(graveyard); + struct afs_wb_key *wbk, *tmp; + + /* Discard unused keys */ + spin_lock(&vnode->wb_lock); + + if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) && + !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) { + list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) { + if (refcount_read(&wbk->usage) == 1) + list_move(&wbk->vnode_link, &graveyard); + } + } + + spin_unlock(&vnode->wb_lock); + + while (!list_empty(&graveyard)) { + wbk = list_entry(graveyard.next, struct afs_wb_key, vnode_link); + list_del(&wbk->vnode_link); + afs_put_wb_key(wbk); + } +} + +/* + * Clean up a page during invalidation. + */ +int afs_launder_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + unsigned long priv; + unsigned int f, t; + int ret = 0; + + _enter("{%lx}", page->index); + + priv = page_private(page); + if (clear_page_dirty_for_io(page)) { + f = 0; + t = PAGE_SIZE; + if (PagePrivate(page)) { + f = priv & AFS_PRIV_MAX; + t = priv >> AFS_PRIV_SHIFT; + } + + trace_afs_page_dirty(vnode, tracepoint_string("launder"), + page->index, priv); + ret = afs_store_data(mapping, page->index, page->index, t, f); + } + + trace_afs_page_dirty(vnode, tracepoint_string("laundered"), + page->index, priv); + set_page_private(page, 0); + ClearPagePrivate(page); + +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + fscache_wait_on_page_write(vnode->cache, page); + fscache_uncache_page(vnode->cache, page); + } +#endif + return ret; } diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 2830e4f48d85..cfcc674e64a5 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -45,7 +45,7 @@ static int afs_xattr_get_cell(const struct xattr_handler *handler, struct afs_cell *cell = vnode->volume->cell; size_t namelen; - namelen = strlen(cell->name); + namelen = cell->name_len; if (size == 0) return namelen; if (namelen > size) @@ -96,7 +96,7 @@ static int afs_xattr_get_volume(const struct xattr_handler *handler, void *buffer, size_t size) { struct afs_vnode *vnode = AFS_FS_I(inode); - const char *volname = vnode->volume->vlocation->vldb.name; + const char *volname = vnode->volume->name; size_t namelen; namelen = strlen(volname); diff --git a/fs/aio.c b/fs/aio.c --- a/fs/aio.c +++ b/fs/aio.c @@ -1297,20 +1297,10 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, - struct timespec __user *timeout) + ktime_t until) { - ktime_t until = KTIME_MAX; long ret = 0; - if (timeout) { - struct timespec ts; - - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - return -EFAULT; - - until = timespec_to_ktime(ts); - } - /* * Note that aio_read_events() is being called as the conditional - i.e.
* we're calling it after prepare_to_wait() has set task state to @@ -1826,6 +1816,25 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return ret; } +static long do_io_getevents(aio_context_t ctx_id, + long min_nr, + long nr, + struct io_event __user *events, + struct timespec64 *ts) +{ + ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX; + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(ioctx)) { + if (likely(min_nr <= nr && min_nr >= 0)) + ret = read_events(ioctx, min_nr, nr, events, until); + percpu_ref_put(&ioctx->users); + } + + return ret; +} + /* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If @@ -1844,15 +1853,14 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, struct io_event __user *, events, struct timespec __user *, timeout) { - struct kioctx *ioctx = lookup_ioctx(ctx_id); - long ret = -EINVAL; + struct timespec64 ts; - if (likely(ioctx)) { - if (likely(min_nr <= nr && min_nr >= 0)) - ret = read_events(ioctx, min_nr, nr, events, timeout); - percpu_ref_put(&ioctx->users); + if (timeout) { + if (unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; } - return ret; + + return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); } #ifdef CONFIG_COMPAT @@ -1862,17 +1870,14 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, struct io_event __user *, events, struct compat_timespec __user *, timeout) { - struct timespec t; - struct timespec __user *ut = NULL; + struct timespec64 t; if (timeout) { - if (compat_get_timespec(&t, timeout)) + if (compat_get_timespec64(&t, timeout)) return -EFAULT; - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t))) - return -EFAULT; } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); + + return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); } #endif diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d79ced925861..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk) pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); - ino->last_used = jiffies; } + ino->last_used = jiffies; return status; } @@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new; new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - if (new == dentry) - dput(new); - else { - struct autofs_info *ino; - - ino = autofs4_dentry_ino(new); - ino->last_used = jiffies; - dput(path->dentry); - path->dentry = new; - } + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; } return path->dentry; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4ac49d038bf3..961a12dc6dc8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -81,7 +81,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ? 
wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -95,6 +96,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, @@ -168,8 +170,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog index 75a461cfaca6..16f2dfe8c2f7 100644 --- a/fs/befs/ChangeLog +++ b/fs/befs/ChangeLog @@ -365,7 +365,7 @@ Version 0.4 (2001-10-28) (fs/befs/super.c) * Tell the kernel to only mount befs read-only. - By setting the MS_RDONLY flag in befs_read_super(). + By setting the SB_RDONLY flag in befs_read_super(). Not that it was possible to write before. But now the kernel won't even try. (fs/befs/super.c) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a92355cc453b..ee236231cafa 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -841,7 +841,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { befs_warning(sb, "No write support. Marking filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -948,7 +948,7 @@ static int befs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EINVAL; return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 73b01e474fdc..83732fef510d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -51,6 +51,11 @@ #define user_siginfo_t siginfo_t #endif +/* That's for binfmt_elf_fdpic to deal with */ +#ifndef elf_check_fdpic +#define elf_check_fdpic(ex) false +#endif + static int load_elf_binary(struct linux_binprm *bprm); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); @@ -541,7 +546,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, if (interp_elf_ex->e_type != ET_EXEC && interp_elf_ex->e_type != ET_DYN) goto out; - if (!elf_check_arch(interp_elf_ex)) + if (!elf_check_arch(interp_elf_ex) || + elf_check_fdpic(interp_elf_ex)) goto out; if (!interpreter->f_op->mmap) goto out; @@ -718,6 +724,8 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; if (!elf_check_arch(&loc->elf_ex)) goto out; + if (elf_check_fdpic(&loc->elf_ex)) + goto out; if (!bprm->file->f_op->mmap) goto out; @@ -817,7 +825,8 @@ static int load_elf_binary(struct linux_binprm *bprm) if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ - if (!elf_check_arch(&loc->interp_elf_ex)) + if (!elf_check_arch(&loc->interp_elf_ex) || + elf_check_fdpic(&loc->interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ @@ -1190,6 +1199,8 @@ static int load_elf_library(struct file *file) if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 || !elf_check_arch(&elf_ex) || !file->f_op->mmap) goto out; + if (elf_check_fdpic(&elf_ex)) + goto out; /* Now read in all of the header information */ @@ -1699,7 +1710,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, 
size_t *total) { unsigned int i; - unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; + unsigned int regset0_size = regset_size(t->task, &view->regsets[0]); /* * NT_PRSTATUS is the one special case, because the regset data @@ -1708,11 +1719,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size, &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1728,7 +1739,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; - size_t size = regset->n * regset->size; + size_t size = regset_size(t->task, regset); void *data = kmalloc(size, GFP_KERNEL); if (unlikely(!data)) return 0; @@ -1743,7 +1754,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, size, data); else { SET_PR_FPVALID(&t->prstatus, - 1, regset_size); + 1, regset0_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e70c039ac190..429326b6e2e7 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -378,6 +378,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) executable_stack); if (retval < 0) goto error; +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES + retval = arch_setup_additional_pages(bprm, !!interpreter_name); + if (retval < 0) + goto error; +#endif #endif /* load the executable and interpreter into memory */ @@ -831,6 +836,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (phdr->p_vaddr >= seg->p_vaddr && phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz) { + Elf32_Dyn __user *dyn; + Elf32_Sword d_tag; + params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr; @@ -843,8 +851,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, goto dynamic_error; tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - if (((Elf32_Dyn *) - params->dynamic_addr)[tmp - 1].d_tag != 0) + dyn = (Elf32_Dyn __user *)params->dynamic_addr; + __get_user(d_tag, &dyn[tmp - 1].d_tag); + if (d_tag != 0) goto dynamic_error; break; } @@ -1489,7 +1498,9 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) struct vm_area_struct *vma; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { +#ifdef CONFIG_MMU unsigned long addr; +#endif if (!maydump(vma, cprm->mm_flags)) continue; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b35ce16b3df3..5982c8a71f02 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -295,7 +295,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages) + unsigned long nr_pages, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; @@ -327,7 +328,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; bio = btrfs_bio_alloc(bdev, first_byte); - 
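A note on the binfmt_elf_fdpic hunk above: once the segments are mapped, params->dynamic_addr is a userspace address, so the DT_NULL terminator check has to go through __get_user() rather than a direct dereference. A minimal sketch of that check, factored into a hypothetical helper (the name and the explicit error returns are illustrative; the hunk itself ignores the __get_user() return value):

    /* Hypothetical helper: verify a user-mapped dynamic section ends in DT_NULL. */
    static int check_dynamic_terminator(unsigned long dynamic_addr,
                                        unsigned long nr_entries)
    {
            Elf32_Dyn __user *dyn = (Elf32_Dyn __user *)dynamic_addr;
            Elf32_Sword d_tag;

            if (__get_user(d_tag, &dyn[nr_entries - 1].d_tag))
                    return -EFAULT;                 /* unreadable user memory */
            return d_tag == 0 ? 0 : -ELIBBAD;       /* must terminate with DT_NULL */
    }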
bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; refcount_set(&cb->pending_bios, 1); @@ -374,7 +375,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_put(bio); bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1528,5 +1529,5 @@ unsigned int btrfs_compress_str2level(const char *str) if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) return str[5] - '0'; - return 0; + return BTRFS_ZLIB_DEFAULT_LEVEL; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index da20755ebf21..0868cc554f14 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,6 +34,8 @@ /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) +#define BTRFS_ZLIB_DEFAULT_LEVEL 3 + struct compressed_bio { /* number of bios pending for this compressed extent */ refcount_t pending_bios; @@ -91,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages); + unsigned long nr_pages, + unsigned int write_flags); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 531e0a8645b0..1e74cf826532 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1032,14 +1032,17 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { @@ -1049,7 +1052,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } if (new_flags != 0) { int level = btrfs_header_level(buf); @@ -1068,9 +1072,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } clean_tree_block(fs_info, buf); *last_ref = 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f7df5536ab61..13c260b525a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2957,7 +2957,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) */ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { - return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info); + return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } static inline void free_fs_info(struct btrfs_fs_info *fs_info) @@ -3180,6 +3180,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int 
delay_iput); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index efce9a2fa9be..a8ecccfc36de 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } @@ -3231,6 +3231,7 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; + int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3273,13 +3274,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh); - } else { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_META | REQ_PRIO, bh); - } + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) + op_flags |= REQ_FUA; + ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); if (ret) errors++; } @@ -3848,7 +3846,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) { + /* + * Since btrfs_mark_buffer_dirty() can be called with item pointer set + * but item data not updated. + * So here we should only check item pointers, not item data. 
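The write_dev_supers() hunk above collapses the two btrfsic_submit_bh() calls into one by computing the flags up front; a hypothetical helper showing the same decision (the helper name is illustrative, the patch open-codes it):

    /* Hypothetical helper mirroring the flag selection in write_dev_supers(). */
    static unsigned int super_write_flags(struct btrfs_fs_info *fs_info, int copy)
    {
            unsigned int op_flags = REQ_SYNC | REQ_META | REQ_PRIO;

            /*
             * Only the first super copy needs FUA; the mirrors may go down
             * lazily, and nobarrier mounts skip FUA entirely.
             */
            if (copy == 0 && !btrfs_test_opt(fs_info, NOBARRIER))
                    op_flags |= REQ_FUA;
            return op_flags;
    }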
+ */ + if (btrfs_header_level(buf) == 0 && + btrfs_check_leaf_relaxed(root, buf)) { btrfs_print_leaf(buf); ASSERT(0); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 673ac4e01dd0..2f4328511ac8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3502,13 +3502,6 @@ again: goto again; } - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -3532,6 +3525,13 @@ again: } WARN_ON(ret); + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + if (i_size_read(inode) > 0) { ret = btrfs_check_trunc_cache_free_space(fs_info, &fs_info->global_block_rsv); @@ -3992,16 +3992,9 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) btrfs_put_block_group(bg); } -static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) { - wait_on_atomic_t(&bg->nocow_writers, - btrfs_wait_nocow_writers_atomic_t, + wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait, TASK_UNINTERRUPTIBLE); } @@ -6530,12 +6523,6 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, btrfs_put_block_group(bg); } -static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) { struct btrfs_space_info *space_info = bg->space_info; @@ -6558,8 +6545,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) down_write(&space_info->groups_sem); up_write(&space_info->groups_sem); - wait_on_atomic_t(&bg->reservations, - btrfs_wait_bg_reservations_atomic_t, + wait_on_atomic_t(&bg->reservations, atomic_t_wait, TASK_UNINTERRUPTIBLE); } @@ -9220,6 +9206,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, fs_info, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); + err = ret; goto out_end_trans; } @@ -11059,12 +11046,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root) return 1; } -static int wait_snapshotting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) { while (true) { @@ -11073,8 +11054,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) ret = btrfs_start_write_no_snapshotting(root); if (ret) break; - wait_on_atomic_t(&root->will_be_snapshotted, - wait_snapshotting_atomic_t, + wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait, TASK_UNINTERRUPTIBLE); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index adbbc017191c..012d63870b99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1984,7 +1984,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, struct btrfs_bio *bbio = NULL; int ret; - ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); bio = btrfs_io_bio_alloc(1); @@ -3253,7 +3253,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, delalloc_start, delalloc_end, &page_started, - nr_written); + nr_written, wbc); /* File system has been set read-only 
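The three helpers deleted in the extent-tree.c hunks above were identical schedule-and-return callbacks; wait_on_atomic_t() callers now share the generic atomic_t_wait() action from linux/wait_bit.h, which reads roughly as follows (a sketch of the 4.15-era helper, not guaranteed to match the tree byte for byte):

    /* Generic action for wait_on_atomic_t(): sleep until woken, honour signals. */
    __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
    {
            schedule();
            if (signal_pending_state(mode, current))
                    return -ERESTARTSYS;

            return 0;
    }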
*/ if (ret) { SetPageError(page); @@ -3797,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping, int scanned = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3814,8 +3814,8 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag))) { unsigned i; scanned = 1; @@ -3825,11 +3825,6 @@ retry: if (!PagePrivate(page)) continue; - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -3941,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3961,8 +3956,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, + &index, end, tag))) { unsigned i; scanned = 1; @@ -3987,12 +3982,6 @@ retry: continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) flush_fn(data); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4a8861379d3e..93dcae0c3183 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -116,7 +116,8 @@ struct extent_io_ops { */ int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written); + unsigned long *nr_written, + struct writeback_control *wbc); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -365,10 +366,11 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) + u64 end, unsigned int extra_bits, + struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, NULL, cached_state, GFP_NOFS); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f80254d82f40..eb1bac7c8553 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,6 +477,47 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + 
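The writeback loops above move from pagevec_lookup_tag(), which needed an open-coded page->index > end bailout, to pagevec_lookup_range_tag(), which stops at end by itself; pagevec_init() also loses its cold argument. A minimal sketch of the resulting iteration pattern, with the per-page work abstracted behind a caller-supplied callback:

    /* Sketch: apply write_one() to every dirty page in [index, end]. */
    static void writeback_range(struct address_space *mapping,
                                pgoff_t index, pgoff_t end,
                                void (*write_one)(struct page *))
    {
            struct pagevec pvec;
            unsigned nr_pages, i;

            pagevec_init(&pvec);    /* no 'cold' argument anymore */
            while ((nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
                                                        end, PAGECACHE_TAG_DIRTY))) {
                    for (i = 0; i < nr_pages; i++)
                            write_one(pvec.pages[i]);
                    pagevec_release(&pvec);
                    cond_resched();
            }
    }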
if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -497,14 +538,34 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, u64 end_of_last_block; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(inode); + unsigned int extra_bits = 0; start_pos = pos & ~((u64) fs_info->sectorsize - 1); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); end_of_last_block = start_pos + num_bytes - 1; + + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (start_pos >= isize && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case + * so just set the delalloc new bit for the range + * directly. + */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode), + start_pos, + num_bytes, cached); + if (err) + return err; + } + } + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - cached, 0); + extra_bits, cached, 0); if (err) return err; @@ -1404,47 +1465,6 @@ fail: } -static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, - const u64 start, - const u64 len, - struct extent_state **cached_state) -{ - u64 search_start = start; - const u64 end = start + len - 1; - - while (search_start < end) { - const u64 search_len = end - search_start + 1; - struct extent_map *em; - u64 em_len; - int ret = 0; - - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); - if (IS_ERR(em)) - return PTR_ERR(em); - - if (em->block_start != EXTENT_MAP_HOLE) - goto next; - - em_len = em->len; - if (em->start < search_start) - em_len -= search_start - em->start; - if (em_len > search_len) - em_len = search_len; - - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); -next: - search_start = extent_map_end(em); - free_extent_map(em); - if (ret) - return ret; - } - return 0; -} - /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1473,10 +1493,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size || - (inode->flags & BTRFS_INODE_PREALLOC)) { + if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - unsigned int clear_bits; lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); @@ -1498,19 +1516,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - ret = btrfs_find_new_delalloc_bytes(inode, start_pos, - last_pos - start_pos + 1, - cached_state); - clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; - if (ret) - clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; - clear_extent_bit(&inode->io_tree, start_pos, - last_pos, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 
1 : 0, - 0, cached_state, GFP_NOFS); - if (ret) - return ret; + clear_extent_bit(&inode->io_tree, start_pos, last_pos, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached_state, GFP_NOFS); *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -2048,6 +2057,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); + btrfs_init_log_ctx(&ctx, inode); + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by @@ -2194,8 +2205,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx, inode); - ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -2253,6 +2262,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cdc9f4015ec3..4426d1c73e50 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1264,7 +1264,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, inode, 0); if (ret) - goto out; + goto out_unlock; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); @@ -1358,6 +1358,7 @@ out_nospc_locked: out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state); +out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b93fe05a39c7..e1a7f3cb5be9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -378,6 +378,7 @@ struct async_cow { struct page *locked_page; u64 start; u64 end; + unsigned int write_flags; struct list_head extents; struct btrfs_work work; }; @@ -857,7 +858,8 @@ retry: async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages)) { + async_extent->nr_pages, + async_cow->write_flags)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1191,7 +1193,8 @@ static noinline void async_cow_free(struct btrfs_work *work) static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; @@ -1208,6 +1211,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; + async_cow->write_flags = write_flags; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -1577,11 +1581,13 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) */ static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = 
private_data; int ret; int force_cow = need_force_cow(inode, start, end); + unsigned int write_flags = wbc_to_write_flags(wbc); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1596,7 +1602,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written); + page_started, nr_written, + write_flags); } if (ret) btrfs_cleanup_ordered_extents(inode, start, end - start + 1); @@ -2025,11 +2032,12 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, } int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state); + extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2090,7 +2098,7 @@ again: goto out; } - btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state, + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state, 0); ClearPageChecked(page); set_page_dirty(page); @@ -2997,6 +3005,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, + ordered_extent->len); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + @@ -4790,7 +4800,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -5438,6 +5448,14 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, goto out_err; btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY) { + btrfs_warn(root->fs_info, +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", + __func__, name, btrfs_ino(BTRFS_I(dir)), + location->objectid, location->type, location->offset); + goto out_err; + } out: btrfs_free_path(path); return ret; @@ -5754,8 +5772,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); @@ -9150,7 +9166,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, end, + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fd172a93d11a..2ef8acaac688 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1172,7 +1172,7 @@ again: if (!i_done || ret) goto out; - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) goto out; /* @@ -1333,7 +1333,7 @@ int 
btrfs_defrag_file(struct inode *inode, struct file *file, * make sure we stop running if someone unmounts * the FS */ - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) break; if (btrfs_defrag_cancelled(fs_info)) { @@ -2206,7 +2206,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, if (!path) return -ENOMEM; - ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; key.objectid = tree_id; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4cf2eb67eba6..f0c3f00e97cb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3268,7 +3268,8 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0); + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, + 0); set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c10e4c70f02d..20d3300bd268 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3521,7 +3521,40 @@ out: } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Check if inode ino2, or any of its ancestors, is inode ino1. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int check_ino_in_path(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + const u64 ino2_gen, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + if (ino1 == ino2) + return ino1_gen == ino2_gen; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent; + u64 parent_gen; + int ret; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) + return ret; + if (parent == ino1) + return parent_gen == ino1_gen; + ino = parent; + } + return 0; +} + +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ static int is_ancestor(struct btrfs_root *root, @@ -3530,36 +3563,91 @@ static int is_ancestor(struct btrfs_root *root, const u64 ino2, struct fs_path *fs_path) { - u64 ino = ino2; - bool free_path = false; + bool free_fs_path = false; int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; if (!fs_path) { fs_path = fs_path_alloc(); if (!fs_path) return -ENOMEM; - free_path = true; + free_fs_path = true; } - while (ino > BTRFS_FIRST_FREE_OBJECTID) { - u64 parent; - u64 parent_gen; + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } - fs_path_reset(fs_path); - ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); - if (ret < 0) { - if (ret == -ENOENT && ino == ino2) - ret = 0; - goto out; + key.objectid = ino2; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + continue; } - if (parent == ino1) { - ret = parent_gen == ino1_gen ? 
1 : 0; - goto out; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino2) + break; + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + unsigned long ptr; + struct btrfs_inode_extref *extref; + + ptr = btrfs_item_ptr_offset(leaf, slot); + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + parent = btrfs_inode_extref_parent(leaf, + extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + parent = key.offset; + cur_offset = item_size; + } + + ret = get_inode_info(root, parent, NULL, &parent_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = check_ino_in_path(root, ino1, ino1_gen, + parent, parent_gen, fs_path); + if (ret) + goto out; } - ino = parent; + path->slots[0]++; } + ret = 0; out: - if (free_path) + btrfs_free_path(path); + if (free_fs_path) fs_path_free(fs_path); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 65af029559b5..3a4dce153645 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -107,7 +107,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) return; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not @@ -137,7 +137,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function /* * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. + * under SB_RDONLY, then it is safe here. */ if (errno == -EROFS && sb_rdonly(sb)) return; @@ -168,7 +168,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) + if (sb->s_flags & SB_BORN) btrfs_handle_error(fs_info); } @@ -507,9 +507,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, token == Opt_compress_force || strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; - info->compress_level = - btrfs_compress_str2level(args[0].from); + info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + /* + * args[0] contains uninitialized data since + * for these tokens we don't expect any + * parameter. 
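For reference, with the BTRFS_ZLIB_DEFAULT_LEVEL change shown earlier in this series, the whole level parser now reads roughly as below: only an explicit "zlib:1" through "zlib:9" selects a level, and anything else that enables zlib falls back to the default of 3 instead of the invalid level 0 (reconstructed from the hunk context; minor details may differ in the tree):

    unsigned int btrfs_compress_str2level(const char *str)
    {
            if (strncmp(str, "zlib", 4) != 0)
                    return 0;

            /* Accept "zlib", "zlib:1" .. "zlib:9". */
            if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
                    return str[5] - '0';

            return BTRFS_ZLIB_DEFAULT_LEVEL;
    }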
+ */ + if (token != Opt_compress && + token != Opt_compress_force) + info->compress_level = + btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -625,7 +634,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= MS_POSIXACL; + info->sb->s_flags |= SB_POSIXACL; break; #else btrfs_err(info, "support for ACL not compiled in!"); @@ -633,7 +642,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_noacl: - info->sb->s_flags &= ~MS_POSIXACL; + info->sb->s_flags &= ~SB_POSIXACL; break; case Opt_notreelog: btrfs_set_and_info(info, NOTREELOG, @@ -851,7 +860,7 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) { btrfs_err(info, "nologreplay must be used with ro mount option"); ret = -EINVAL; @@ -1147,7 +1156,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; @@ -1180,7 +1189,7 @@ static int btrfs_fill_super(struct super_block *sb, } cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; return 0; fail_close: @@ -1277,7 +1286,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); - if (!(info->sb->s_flags & MS_POSIXACL)) + if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); @@ -1409,11 +1418,11 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + if (flags & SB_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY, device_name, newargs); } else { - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY, device_name, newargs); if (IS_ERR(mnt)) { root = ERR_CAST(mnt); @@ -1565,7 +1574,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, u64 subvol_objectid = 0; int error = 0; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, @@ -1619,13 +1628,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) goto error_fs_info; - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); @@ -1635,7 +1644,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { btrfs_close_devices(fs_devices); free_fs_info(fs_info); - if ((flags ^ s->s_flags) & 
MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); @@ -1702,11 +1711,11 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, { if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (flags & MS_RDONLY))) { + (flags & SB_RDONLY))) { /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); - if (flags & MS_RDONLY) + if (flags & SB_RDONLY) sync_filesystem(fs_info->sb); } } @@ -1766,10 +1775,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. @@ -1781,10 +1790,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) /* avoid complains from lockdep et al. */ up(&fs_info->uuid_tree_rescan_sem); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* - * Setting MS_RDONLY will put the cleaner thread to + * Setting SB_RDONLY will put the cleaner thread to * sleep at the next loop if it's already active. * If it's already asleep, we'll leave unused block * groups on disk until we're mounted read-write again @@ -1856,7 +1865,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; set_bit(BTRFS_FS_OPEN, &fs_info->flags); } @@ -1866,9 +1875,9 @@ out: return 0; restore: - /* We've hit an error - don't reset MS_RDONLY */ + /* We've hit an error - don't reset SB_RDONLY */ if (sb_rdonly(sb)) - old_flags |= MS_RDONLY; + old_flags |= SB_RDONLY; sb->s_flags = old_flags; fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d06b1c931d05..2e7f64a3b22b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -114,7 +114,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -145,7 +145,7 @@ static int test_find_delalloc(u32 sectorsize) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -200,7 +200,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. 
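The MS_* to SB_* conversions running through this series separate the kernel's internal sb->s_flags bits from the mount(2) MS_* ABI constants; the numeric values are unchanged, so hunks like the one above are purely mechanical. The relevant linux/fs.h definitions look roughly like this:

    /* In-kernel super_block flags; values mirror the userspace MS_* ABI. */
    #define SB_RDONLY        1      /* mount read-only */
    #define SB_NOSUID        2      /* ignore suid and sgid bits */
    #define SB_NODEV         4      /* disallow access to device special files */

    static inline bool sb_rdonly(const struct super_block *sb)
    {
            return sb->s_flags & SB_RDONLY;
    }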
*/ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index f797642c013d..30affb60da51 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -984,7 +984,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1018,7 +1018,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1036,7 +1036,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1053,7 +1053,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1089,7 +1089,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 114fc5f0ecc5..ce4ed6ec8f39 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -242,7 +242,8 @@ static int check_leaf_item(struct btrfs_root *root, return ret; } -int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) +static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, + bool check_item_data) { struct btrfs_fs_info *fs_info = root->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -361,10 +362,15 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) return -EUCLEAN; } - /* Check if the item size and content meet other criteria */ - ret = check_leaf_item(root, leaf, &key, slot); - if (ret < 0) - return ret; + if (check_item_data) { + /* + * Check if the item size and content meet other + * criteria + */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + } prev_key.objectid = key.objectid; prev_key.type = key.type; @@ -374,6 +380,17 @@ int btrfs_check_leaf(struct 
btrfs_root *root, struct extent_buffer *leaf) return 0; } +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, true); +} + +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, false); +} + int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) { unsigned long nr = btrfs_header_nritems(node); diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 96c486e95d70..3d53e8d6fda0 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -20,7 +20,19 @@ #include "ctree.h" #include "extent_io.h" -int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf); +/* + * Comprehensive leaf checker. + * Will check not only the item pointers, but also every possible member + * in item data. + */ +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf); + +/* + * Less strict leaf checker. + * Will only check item pointers, not reading item data. + */ +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf); int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index aa7c71cff575..7bf9b31561db 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4102,7 +4102,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ordered_io_err) { ctx->io_err = -EIO; - return 0; + return ctx->io_err; } btrfs_init_map_token(&token); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1ecb938ba4d..49810b70afd3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -189,6 +189,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } kfree(fs_devices); @@ -578,6 +579,7 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev) fs_devs->num_devices--; list_del(&dev->dev_list); rcu_string_free(dev->name); + bio_put(dev->flush_bio); kfree(dev); } break; @@ -630,6 +632,7 @@ static noinline int device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { + bio_put(device->flush_bio); kfree(device); return -ENOMEM; } @@ -742,6 +745,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); goto error; } @@ -807,6 +811,7 @@ again: list_del_init(&device->dev_list); fs_devices->num_devices--; rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1750,20 +1755,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; + if (ret) { + if (ret > 0) + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + out: btrfs_free_path(path); - btrfs_commit_transaction(trans); + if (!ret) + ret = btrfs_commit_transaction(trans); return ret; } @@ -1993,7 +2002,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); - 
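The repeated bio_put() additions in the volumes.c hunks above all plug the same leak: btrfs_alloc_device() preallocates device->flush_bio, so every failure path that kfree()s the device must drop that bio too. A hypothetical helper capturing the pattern (the patch open-codes it at each site):

    /* Hypothetical: tear down a not-yet-registered btrfs_device. */
    static void btrfs_destroy_device(struct btrfs_device *device)
    {
            rcu_string_free(device->name);
            bio_put(device->flush_bio);     /* pairs with btrfs_alloc_device() */
            kfree(device);
    }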
list_del_rcu(&srcdev->dev_alloc_list); + list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; if (srcdev->missing) fs_devices->missing_devices--; @@ -2349,6 +2358,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -2358,6 +2368,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); ret = PTR_ERR(trans); goto error; @@ -2384,7 +2395,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; ret = btrfs_prepare_sprout(fs_info); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2497,10 +2508,11 @@ error_sysfs: btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); error_trans: if (seeding_dev) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2567,6 +2579,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -6284,6 +6297,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { + bio_put(dev->flush_bio); kfree(dev); return ERR_PTR(ret); } diff --git a/fs/buffer.c b/fs/buffer.c index 1c18a22a6013..0736a6a2e2f0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1592,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { count = pagevec_count(&pvec); for (i = 0; i < count; i++) { @@ -3514,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, if (length <= 0) return -ENOENT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); do { unsigned nr_pages, i; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 18d7aa61ef0f..883bc7bb12c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem_monitor; } @@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem; } @@ -710,7 +708,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; op->op.flags |= FSCACHE_OP_ASYNC; @@ -844,7 +842,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval 
*op, ret = cachefiles_has_space(cache, 0, *nr_pages); if (ret == 0) { - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); list_for_each_entry(page, pages, lru) { if (pagevec_add(&pagevec, page) == 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4d622654bfbc..dbf07051aacd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -680,7 +680,7 @@ static void ceph_release_pages(struct page **pages, int num) struct pagevec pvec; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); for (i = 0; i < num; i++) { if (pagevec_add(&pvec, pages[i]) == 0) pagevec_release(&pvec); @@ -811,7 +811,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; @@ -870,15 +870,10 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = min_t(unsigned, PAGEVEC_SIZE, - max_pages - locked_pages); - if (end - index < (u64)(pvec_pages - 1)) - pvec_pages = (unsigned)(end - index) + 1; - - pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - pvec_pages); - dout("pagevec_lookup_tag got %d\n", pvec_pages); + pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY, + max_pages - locked_pages); + dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { @@ -896,16 +891,6 @@ get_more_pages: unlock_page(page); continue; } - if (page->index > end) { - dout("end of range %p\n", page); - /* can't be range_cyclic (1st pass) because - * end == -1 in that case. */ - stop = true; - if (ceph_wbc.head_snapc) - done = true; - unlock_page(page); - break; - } if (strip_unit_end && (page->index > strip_unit_end)) { dout("end of strip unit %p\n", page); unlock_page(page); @@ -1177,8 +1162,7 @@ release_pvec_pages: index = 0; while ((index <= end) && (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { for (i = 0; i < nr; i++) { page = pvec.pages[i]; if (page_snap_context(page) != snapc) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ff5d32cf9578..a14b2c974c9e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1160,7 +1160,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; struct cap_msg_args arg; - int held, revoking, dropping; + int held, revoking; int wake = 0; int delayed = 0; int ret; @@ -1168,7 +1168,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dropping = cap->issued & ~retain; dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", inode, cap, cap->session, @@ -1712,7 +1711,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, /* if we are unmounting, flush any unused caps immediately. 
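ceph_release_pages() above is the idiomatic pagevec batching that this series keeps converting to the argument-less pagevec_init(): pages accumulate in the vector and are released in bursts. Reduced to its core:

    /* Sketch: release an array of pages in PAGEVEC_SIZE batches. */
    static void release_pages_batched(struct page **pages, int num)
    {
            struct pagevec pvec;
            int i;

            pagevec_init(&pvec);
            for (i = 0; i < num; i++)
                    if (pagevec_add(&pvec, pages[i]) == 0)
                            pagevec_release(&pvec); /* vector was full */
            pagevec_release(&pvec);                 /* flush the remainder */
    }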
*/ if (mdsc->stopping) - is_delayed = 1; + is_delayed = true; spin_lock(&ci->i_ceph_lock); @@ -3189,8 +3188,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; - bool wake_ci = 0; - bool wake_mdsc = 0; + bool wake_ci = false; + bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { if (cf->tid == flush_tid) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f2550a076edc..ab81652198c4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -493,6 +493,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wb_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; + atomic_set(&ci->i_filelock_ref, 0); ci->i_shared_gen = 0; ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; @@ -786,7 +787,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ ci->i_version = le64_to_cpu(info->version); - inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -1185,6 +1185,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + ceph_dir_clear_ordered(dir); d_delete(dn); dput(dn); goto retry_lookup; @@ -1322,6 +1323,7 @@ retry_lookup: dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); + ceph_dir_clear_ordered(dir); d_invalidate(dn); have_lease = false; } @@ -1573,6 +1575,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + __ceph_dir_clear_ordered(ci); d_delete(dn); dput(dn); goto retry_lookup; @@ -1597,7 +1600,9 @@ retry_lookup: &req->r_caps_reservation); if (ret < 0) { pr_err("fill_inode badness on %p\n", in); - if (d_really_is_negative(dn)) + if (d_really_is_positive(dn)) + __ceph_dir_clear_ordered(ci); + else iput(in); d_drop(dn); err = ret; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index e7cce412f2cf..9e66f69ee8a5 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -30,19 +30,52 @@ void __init ceph_flock_init(void) get_random_bytes(&lock_secret, sizeof(lock_secret)); } +static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct inode *inode = file_inode(src->fl_file); + atomic_inc(&ceph_inode(inode)->i_filelock_ref); +} + +static void ceph_fl_release_lock(struct file_lock *fl) +{ + struct inode *inode = file_inode(fl->fl_file); + struct ceph_inode_info *ci = ceph_inode(inode); + if (atomic_dec_and_test(&ci->i_filelock_ref)) { + /* clear error when all locks are released */ + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; + spin_unlock(&ci->i_ceph_lock); + } +} + +static const struct file_lock_operations ceph_fl_lock_ops = { + .fl_copy_lock = ceph_fl_copy_lock, + .fl_release_private = ceph_fl_release_lock, +}; + /** * Implement fcntl and flock locking functions. 
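The new ceph_fl_lock_ops piggybacks on two VFS callbacks so that i_filelock_ref tracks every live file_lock: fs/locks.c invokes ->fl_copy_lock() when it duplicates a lock and ->fl_release_private() when it frees one. A rough sketch of the VFS side (simplified from fs/locks.c, not part of this patch):

    void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
    {
            /* ...field copies elided... */
            if (fl->fl_ops && fl->fl_ops->fl_copy_lock)
                    fl->fl_ops->fl_copy_lock(new, fl);      /* ceph: atomic_inc */
    }

    void locks_release_private(struct file_lock *fl)
    {
            if (fl->fl_ops) {
                    if (fl->fl_ops->fl_release_private)
                            fl->fl_ops->fl_release_private(fl); /* ceph: atomic_dec */
                    fl->fl_ops = NULL;
            }
    }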
*/ -static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, +static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { - struct inode *inode = file_inode(file); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; u64 owner; + if (operation == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, auth caps may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ceph_inode(inode)->i_filelock_ref); + } + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) wait = 0; @@ -180,10 +213,12 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, */ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; - int err; - u8 wait = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; + u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -199,6 +234,26 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else if (IS_SETLKW(cmd)) wait = 1; + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else if (op == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, i_auth_cap may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) + posix_lock_file(file, fl, NULL); + return err; + } + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; else if (F_WRLCK == fl->fl_type) @@ -206,16 +261,16 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op != CEPH_MDS_OP_GETFILELOCK) { + if (op == CEPH_MDS_OP_SETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + if (err) { /* undo! This should only happen if * the kernel detects local * deadlock. 
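For orientation, the path being hardened in ceph_lock() above is what an ordinary byte-range lock on a CephFS file takes; a minimal userspace trigger:

    #include <fcntl.h>
    #include <unistd.h>

    /* fd refers to an open file on a CephFS mount. */
    static int lock_first_page(int fd)
    {
            struct flock fl = {
                    .l_type   = F_WRLCK,    /* maps to CEPH_LOCK_EXCL */
                    .l_whence = SEEK_SET,
                    .l_start  = 0,
                    .l_len    = 4096,
            };

            /* F_SETLKW makes ceph_lock() run with wait = 1. */
            return fcntl(fd, F_SETLKW, &fl);
    }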
*/ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on posix_lock_file, undid lock", err); @@ -227,9 +282,11 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; - int err; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; @@ -239,6 +296,21 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) dout("ceph_flock, fl_file: %p", fl->fl_file); + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else { + /* see comment in ceph_lock */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (F_UNLCK == fl->fl_type) + locks_lock_file_wait(file, fl); + return err; + } + if (IS_SETLKW(cmd)) wait = 1; @@ -250,13 +322,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) lock_cmd = CEPH_LOCK_UNLOCK; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, lock_cmd, wait, fl); + inode, lock_cmd, wait, fl); if (!err) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); + inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on locks_lock_file_wait, undid lock", err); } } @@ -288,6 +360,37 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *flock_count, *fcntl_count); } +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +static int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} + /** * Encode the flock and fcntl locks for the given inode into the ceph_filelock * array. Must be called with inode->i_lock already held. 
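For context (an illustrative aside, not part of the patch): the ceph_lock() path above is entered from userspace through fcntl(2), with F_SETLKW selecting the blocking path (wait = 1) and F_WRLCK mapping to CEPH_LOCK_EXCL. A minimal standalone exerciser using only standard POSIX calls follows; the file name is arbitrary.

/* Minimal userspace exerciser for the fcntl path serviced by
 * ceph_lock(); run it against any file on a CephFS mount. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct flock fl = {
		.l_type   = F_WRLCK,	/* ceph_lock() maps this to CEPH_LOCK_EXCL */
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,		/* zero length locks the whole file */
	};
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (fcntl(fd, F_SETLKW, &fl) < 0) {	/* blocking: wait = 1 above */
		perror("F_SETLKW");
		return 1;
	}
	fl.l_type = F_UNLCK;			/* unlock maps to CEPH_LOCK_UNLOCK */
	fcntl(fd, F_SETLK, &fl);
	close(fd);
	return 0;
}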
@@ -356,50 +459,22 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks, if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, flocks, - num_fcntl_locks * sizeof(*flocks)); - if (err) - goto out_fail; + if (num_fcntl_locks > 0) { + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + } nlocks = cpu_to_le32(num_flock_locks); err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, - &flocks[num_fcntl_locks], - num_flock_locks * sizeof(*flocks)); -out_fail: - return err; -} - -/* - * Given a pointer to a lock, convert it to a ceph filelock - */ -int lock_to_ceph_filelock(struct file_lock *lock, - struct ceph_filelock *cephlock) -{ - int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); - cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); - cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64((u64)lock->fl_pid); - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); - - switch (lock->fl_type) { - case F_RDLCK: - cephlock->type = CEPH_LOCK_SHARED; - break; - case F_WRLCK: - cephlock->type = CEPH_LOCK_EXCL; - break; - case F_UNLCK: - cephlock->type = CEPH_LOCK_UNLOCK; - break; - default: - dout("Have unknown lock type %d", lock->fl_type); - err = -EINVAL; + if (num_flock_locks > 0) { + err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); } - +out_fail: return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0687ab3c3267..1b468250e947 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1039,22 +1039,23 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* caller holds s_cap_lock, we drop it */ -static void cleanup_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) - __releases(session->s_cap_lock) +static void detach_cap_releases(struct ceph_mds_session *session, + struct list_head *target) { - LIST_HEAD(tmp_list); - list_splice_init(&session->s_cap_releases, &tmp_list); + lockdep_assert_held(&session->s_cap_lock); + + list_splice_init(&session->s_cap_releases, target); session->s_num_cap_releases = 0; - spin_unlock(&session->s_cap_lock); + dout("dispose_cap_releases mds%d\n", session->s_mds); +} - dout("cleanup_cap_releases mds%d\n", session->s_mds); - while (!list_empty(&tmp_list)) { +static void dispose_cap_releases(struct ceph_mds_client *mdsc, + struct list_head *dispose) +{ + while (!list_empty(dispose)) { struct ceph_cap *cap; /* zero out the in-progress message */ - cap = list_first_entry(&tmp_list, - struct ceph_cap, session_caps); + cap = list_first_entry(dispose, struct ceph_cap, session_caps); list_del(&cap->session_caps); ceph_put_cap(mdsc, cap); } @@ -1215,6 +1216,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_unlock(&mdsc->cap_dirty_lock); + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited(" dropping file locks for %p %lld\n", + inode, ceph_ino(inode)); + } + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; @@ -1244,6 +1252,8 @@ static void remove_session_caps(struct ceph_mds_session *session) { struct ceph_fs_client *fsc = session->s_mdsc->fsc; struct super_block *sb = fsc->sb; + LIST_HEAD(dispose); 
+ dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, fsc); @@ -1278,10 +1288,12 @@ static void remove_session_caps(struct ceph_mds_session *session) } // drop cap expires and unlock s_cap_lock - cleanup_cap_releases(session->s_mdsc, session); + detach_cap_releases(session, &dispose); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(session->s_mdsc, &dispose); } /* @@ -1428,6 +1440,29 @@ static int __close_session(struct ceph_mds_client *mdsc, return request_close_session(mdsc, session); } +static bool drop_negative_children(struct dentry *dentry) +{ + struct dentry *child; + bool all_negative = true; + + if (!d_is_dir(dentry)) + goto out; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (d_really_is_positive(child)) { + all_negative = false; + break; + } + } + spin_unlock(&dentry->d_lock); + + if (all_negative) + shrink_dcache_parent(dentry); +out: + return all_negative; +} + /* * Trim old(er) caps. * @@ -1462,6 +1497,11 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; + /* Note: it's possible that i_filelock_ref becomes non-zero + * after dropping auth caps. It doesn't hurt because reply + * of lock mds request will re-add auth caps. */ + if (atomic_read(&ci->i_filelock_ref) > 0) + goto out; } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ @@ -1473,16 +1513,27 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ - session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); + session->s_trim_caps--; } else { + struct dentry *dentry; /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); - d_prune_aliases(inode); - dout("trim_caps_cb %p cap %p pruned, count now %d\n", - inode, cap, atomic_read(&inode->i_count)); + dentry = d_find_any_alias(inode); + if (dentry && drop_negative_children(dentry)) { + int count; + dput(dentry); + d_prune_aliases(inode); + count = atomic_read(&inode->i_count); + if (count == 1) + session->s_trim_caps--; + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, count); + } else { + dput(dentry); + } return 0; } @@ -2827,7 +2878,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - struct ceph_inode_info *ci; + struct ceph_inode_info *ci = cap->ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; @@ -2836,8 +2887,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, u64 snap_follows; struct dentry *dentry; - ci = cap->ci; - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); @@ -2870,7 +2919,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); - rec.v2.flock_len = 0; + rec.v2.flock_len = (__force __le32) + ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 
0 : 1); } else { rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -2894,26 +2944,37 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; - struct ceph_filelock *flocks; + struct ceph_filelock *flocks = NULL; size_t struct_len, total_len = 0; u8 struct_v = 0; encode_again: - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - flocks = kmalloc((num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock), GFP_NOFS); - if (!flocks) { - err = -ENOMEM; - goto out_free; + if (rec.v2.flock_len) { + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + } else { + num_fcntl_locks = 0; + num_flock_locks = 0; } - err = ceph_encode_locks_to_buffer(inode, flocks, - num_fcntl_locks, - num_flock_locks); - if (err) { + if (num_fcntl_locks + num_flock_locks > 0) { + flocks = kmalloc((num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + if (err) { + kfree(flocks); + flocks = NULL; + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + } else { kfree(flocks); - if (err == -ENOSPC) - goto encode_again; - goto out_free; + flocks = NULL; } if (recon_state->msg_version >= 3) { @@ -2993,6 +3054,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int s_nr_caps; struct ceph_pagelist *pagelist; struct ceph_reconnect_state recon_state; + LIST_HEAD(dispose); pr_info("mds%d reconnect start\n", mds); @@ -3026,7 +3088,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - cleanup_cap_releases(mdsc, session); + detach_cap_releases(session, &dispose); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(mdsc, &dispose); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) @@ -3857,14 +3921,14 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) goto err_out; } return; + bad: pr_err("error decoding fsmap\n"); err_out: mutex_lock(&mdsc->mutex); - mdsc->mdsmap_err = -ENOENT; + mdsc->mdsmap_err = err; __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); - return; } /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e4082afedcb1..a62d2a9841dc 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -84,8 +84,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - /* leave fsid little-endian, regardless of host endianness */ - fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + /* Must convert the fsid, for consistent values across arches */ + fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^ + le64_to_cpu(*((__le64 *)&monmap->fsid + 1)); buf->f_fsid.val[0] = fsid & 0xffffffff; buf->f_fsid.val[1] = fsid >> 32; @@ -330,11 +331,11 @@ static int parse_fsopt_token(char *c, void *private) break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: - fsopt->sb_flags |= MS_POSIXACL; + fsopt->sb_flags |= SB_POSIXACL; break; #endif case Opt_noacl: - fsopt->sb_flags &= ~MS_POSIXACL; + fsopt->sb_flags &= ~SB_POSIXACL; break; default: BUG_ON(token); @@ -519,7 +520,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - if 
(fsopt->sb_flags & MS_POSIXACL) + if (fsopt->sb_flags & SB_POSIXACL) seq_puts(m, ",acl"); else seq_puts(m, ",noacl"); @@ -987,7 +988,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, dout("ceph_mount\n"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - flags |= MS_POSIXACL; + flags |= SB_POSIXACL; #endif err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3e27a28aa44a..2beeec07fa76 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -352,6 +352,7 @@ struct ceph_inode_info { int i_pin_ref; int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; + atomic_t i_filelock_ref; u32 i_shared_gen; /* incremented each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ @@ -487,6 +488,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snaps */ #define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ + /* * We set the ERROR_WRITE bit when we start seeing write errors on an inode @@ -1011,7 +1014,6 @@ extern int ceph_encode_locks_to_buffer(struct inode *inode, extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks); -extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index cbd216b57239..350fa55a1bf7 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -42,7 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ -#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of SB_POSIXACL in mnt_cifs_flags */ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ #define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8c8b75d33f31..31b7565b1617 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -125,7 +125,7 @@ cifs_read_super(struct super_block *sb) tcon = cifs_sb_master_tcon(cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -497,7 +497,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) seq_puts(s, ",dynperm"); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_puts(s, ",mfsymlinks"); @@ -573,7 +573,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -708,7
+708,7 @@ cifs_do_mount(struct file_system_type *fs_type, rc = cifs_mount(cifs_sb, volume_info); if (rc) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); @@ -720,7 +720,7 @@ cifs_do_mount(struct file_system_type *fs_type, mnt_data.flags = flags; /* BB should we make this contingent on mount parm? */ - flags |= MS_NODIRATIME | MS_NOATIME; + flags |= SB_NODIRATIME | SB_NOATIME; sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); if (IS_ERR(sb)) { @@ -739,7 +739,7 @@ cifs_do_mount(struct file_system_type *fs_type, goto out_super; } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } root = cifs_get_root(volume_info, sb); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e185b2853eab..b16583594d1a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -559,8 +559,8 @@ struct smb_vol { CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) -#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ - MS_NODEV | MS_SYNCHRONOUS) +#define CIFS_MS_MASK (SB_RDONLY | SB_MANDLOCK | SB_NOEXEC | SB_NOSUID | \ + SB_NODEV | SB_SYNCHRONOUS) struct cifs_mnt_data { struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 92fdf9c35de2..df9f682708c6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1963,8 +1963,6 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, pgoff_t end, pgoff_t *index, unsigned int *found_pages) { - unsigned int nr_pages; - struct page **pages; struct cifs_writedata *wdata; wdata = cifs_writedata_alloc((unsigned int)tofind, @@ -1972,23 +1970,8 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, if (!wdata) return NULL; - /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_SIZE. 
- */ - *found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, index, - PAGECACHE_TAG_DIRTY, tofind, - pages); - *found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && *index <= end); - + *found_pages = find_get_pages_range_tag(mapping, index, end, + PAGECACHE_TAG_DIRTY, tofind, wdata->pages); return wdata; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7c732cb44164..ecb99079363a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -985,7 +985,7 @@ retry_iget5_locked: } cifs_fattr_to_inode(inode, fattr); - if (sb->s_flags & MS_NOATIME) + if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e06740436b92..ed88ab8a4774 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1406,7 +1406,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } while (rc == -EAGAIN); if (rc) { - cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); + if (rc != -ENOENT) + cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); goto out; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5331631386a2..01346b8b6edb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2678,27 +2678,27 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; - shdr = get_sync_hdr(rsp); - if (shdr->Status == STATUS_END_OF_FILE) { + if (rc) { + if (rc != -ENODATA) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); + cifs_dbg(VFS, "Send error in read = %d\n", rc); + } free_rsp_buf(resp_buftype, rsp_iov.iov_base); - return 0; + return rc == -ENODATA ? 
0 : rc; } - if (rc) { - cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); - cifs_dbg(VFS, "Send error in read = %d\n", rc); - } else { - *nbytes = le32_to_cpu(rsp->DataLength); - if ((*nbytes > CIFS_MAX_MSGSIZE) || - (*nbytes > io_parms->length)) { - cifs_dbg(FYI, "bad length %d for count %d\n", - *nbytes, io_parms->length); - rc = -EIO; - *nbytes = 0; - } + *nbytes = le32_to_cpu(rsp->DataLength); + if ((*nbytes > CIFS_MAX_MSGSIZE) || + (*nbytes > io_parms->length)) { + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); + rc = -EIO; + *nbytes = 0; } + shdr = get_sync_hdr(rsp); + if (*buf) { memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, rsp_iov.iov_base); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 52f975d848a0..316af84674f1 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -117,7 +117,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -129,7 +129,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, @@ -266,7 +266,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -276,7 +276,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_DEFAULT: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6f0a6a4d5faa..97424cf206c0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -96,7 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -188,7 +188,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&vc->vc_mutex); sb->s_fs_info = vc; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; sb->s_blocksize = 4096; /* XXXXX what do we put here?? 
*/ sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index a37f003530d7..1175a1722411 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -447,8 +447,7 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid) UPARG(CODA_FSYNC); inp->coda_fsync.VFid = *fid; - error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs), - &outsize, inp); + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); CODA_FREE(inp, insize); return error; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index bd5d91e119ca..5fc5dc660600 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -54,8 +54,6 @@ #include <linux/if_tun.h> #include <linux/ctype.h> #include <linux/syscalls.h> -#include <linux/i2c.h> -#include <linux/i2c-dev.h> #include <linux/atalk.h> #include <linux/gfp.h> #include <linux/cec.h> @@ -137,22 +135,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return vfs_ioctl(file, cmd, arg); } -static int w_long(struct file *file, - unsigned int cmd, compat_ulong_t __user *argp) -{ - int err; - unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); - - if (valp == NULL) - return -EFAULT; - err = do_ioctl(file, cmd, (unsigned long)valp); - if (err) - return err; - if (convert_in_user(valp, argp)) - return -EFAULT; - return 0; -} - struct compat_video_event { int32_t type; compat_time_t timestamp; @@ -671,96 +653,6 @@ static int serial_struct_ioctl(struct file *file, return err; } -/* - * I2C layer ioctls - */ - -struct i2c_msg32 { - u16 addr; - u16 flags; - u16 len; - compat_caddr_t buf; -}; - -struct i2c_rdwr_ioctl_data32 { - compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ - u32 nmsgs; -}; - -struct i2c_smbus_ioctl_data32 { - u8 read_write; - u8 command; - u32 size; - compat_caddr_t data; /* union i2c_smbus_data *data */ -}; - -struct i2c_rdwr_aligned { - struct i2c_rdwr_ioctl_data cmd; - struct i2c_msg msgs[0]; -}; - -static int do_i2c_rdwr_ioctl(struct file *file, - unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) -{ - struct i2c_rdwr_aligned __user *tdata; - struct i2c_msg __user *tmsgs; - struct i2c_msg32 __user *umsgs; - compat_caddr_t datap; - u32 nmsgs; - int i; - - if (get_user(nmsgs, &udata->nmsgs)) - return -EFAULT; - if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS) - return -EINVAL; - - if (get_user(datap, &udata->msgs)) - return -EFAULT; - umsgs = compat_ptr(datap); - - tdata = compat_alloc_user_space(sizeof(*tdata) + - nmsgs * sizeof(struct i2c_msg)); - tmsgs = &tdata->msgs[0]; - - if (put_user(nmsgs, &tdata->cmd.nmsgs) || - put_user(tmsgs, &tdata->cmd.msgs)) - return -EFAULT; - - for (i = 0; i < nmsgs; i++) { - if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16))) - return -EFAULT; - if (get_user(datap, &umsgs[i].buf) || - put_user(compat_ptr(datap), &tmsgs[i].buf)) - return -EFAULT; - } - return do_ioctl(file, cmd, (unsigned long)tdata); -} - -static int do_i2c_smbus_ioctl(struct file *file, - unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) -{ - struct i2c_smbus_ioctl_data __user *tdata; - union { - /* beginnings of those have identical layouts */ - struct i2c_smbus_ioctl_data32 data32; - struct i2c_smbus_ioctl_data data; - } v; - - tdata = compat_alloc_user_space(sizeof(*tdata)); - if (tdata == NULL) - return -ENOMEM; - - memset(&v, 0, sizeof(v)); - if (copy_from_user(&v.data32, udata, sizeof(v.data32))) - return -EFAULT; - v.data.data = compat_ptr(v.data32.data); - - if (copy_to_user(tdata, &v.data, sizeof(v.data))) - return -EFAULT; - - 
return do_ioctl(file, cmd, (unsigned long)tdata); -} - #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) #define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) @@ -1283,13 +1175,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* i2c */ -COMPATIBLE_IOCTL(I2C_SLAVE) -COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) -COMPATIBLE_IOCTL(I2C_TENBIT) -COMPATIBLE_IOCTL(I2C_PEC) -COMPATIBLE_IOCTL(I2C_RETRIES) -COMPATIBLE_IOCTL(I2C_TIMEOUT) /* hiddev */ COMPATIBLE_IOCTL(HIDIOCGVERSION) COMPATIBLE_IOCTL(HIDIOCAPPLICATION) @@ -1464,13 +1349,6 @@ static long do_ioctl_trans(unsigned int cmd, case TIOCGSERIAL: case TIOCSSERIAL: return serial_struct_ioctl(file, cmd, argp); - /* i2c */ - case I2C_FUNCS: - return w_long(file, cmd, argp); - case I2C_RDWR: - return do_i2c_rdwr_ioctl(file, cmd, argp); - case I2C_SMBUS: - return do_i2c_smbus_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: @@ -1580,6 +1458,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, case FICLONE: case FICLONERANGE: case FIDEDUPERANGE: + case FS_IOC_FIEMAP: goto do_ioctl; case FIBMAP: diff --git a/fs/coredump.c b/fs/coredump.c index 52c63d6c9143..1e2c87acac9b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -680,16 +680,11 @@ void do_coredump(const siginfo_t *siginfo) * privs and don't want to unlink another user's coredump. */ if (!need_suid_safe) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); /* * If it doesn't exist, that's fine. If there's some * other problem, we'll catch it at the filp_open(). */ - (void) sys_unlink((const char __user *)cn.corename); - set_fs(old_fs); + do_unlinkat(AT_FDCWD, getname_kernel(cn.corename)); } /* diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index 11b29d491b7c..58e2fe40b2a0 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -1,6 +1,5 @@ config CRAMFS - tristate "Compressed ROM file system support (cramfs) (OBSOLETE)" - depends on BLOCK + tristate "Compressed ROM file system support (cramfs)" select ZLIB_INFLATE help Saying Y here includes support for CramFs (Compressed ROM File @@ -16,7 +15,40 @@ config CRAMFS cramfs. Note that the root file system (the one containing the directory /) cannot be compiled as a module. - This filesystem is obsoleted by SquashFS, which is much better - in terms of performance and features. + This filesystem is limited in capabilities and performance on + purpose to remain small and low on RAM usage. It is most suitable + for small embedded systems. If you have ample RAM to spare, you may + consider a more capable compressed filesystem such as SquashFS + which is much better in terms of performance and features. + + If unsure, say N. + +config CRAMFS_BLOCKDEV + bool "Support CramFs image over a regular block device" if EXPERT + depends on CRAMFS && BLOCK + default y + help + This option allows the CramFs driver to load data from a regular + block device such as a disk partition or a ramdisk. + +config CRAMFS_MTD + bool "Support CramFs image directly mapped in physical memory" + depends on CRAMFS && MTD + depends on CRAMFS=m || MTD=y + default y if !CRAMFS_BLOCKDEV + help + This option allows the CramFs driver to load data directly from + a linearly addressed memory range (usually non-volatile memory + like flash) instead of going through the block device layer.
+ This saves some memory since no intermediate buffering is + necessary. + + The location of the CramFs image is determined by an + MTD device capable of direct memory mapping e.g. from + the 'physmap' map driver or a resulting MTD partition. + For example, this would mount the cramfs image stored in + the MTD partition named "xip_fs" on the /mnt mountpoint: + + mount -t cramfs mtd:xip_fs /mnt If unsure, say N. diff --git a/fs/cramfs/README b/fs/cramfs/README index 9d4e7ea311f4..d71b27e0ff15 100644 --- a/fs/cramfs/README +++ b/fs/cramfs/README @@ -49,17 +49,46 @@ same as the start of the (i+1)'th <block> if there is one). The first <block> immediately follows the last <block_pointer> for the file. <block_pointer>s are each 32 bits long. +When the CRAMFS_FLAG_EXT_BLOCK_POINTERS capability bit is set, each +<block_pointer>'s top bits may contain special flags as follows: + +CRAMFS_BLK_FLAG_UNCOMPRESSED (bit 31): + The block data is not compressed and should be copied verbatim. + +CRAMFS_BLK_FLAG_DIRECT_PTR (bit 30): + The <block_pointer> stores the actual block start offset and not + its end, shifted right by 2 bits. The block must therefore be + aligned to a 4-byte boundary. The block size is blksize + if CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified; otherwise + the compressed data length is included in the first 2 bytes of + the block data. This is used to allow discontiguous data layout + and specific data block alignments e.g. for XIP applications. + + The order of <file_data>'s is a depth-first descent of the directory tree, i.e. the same order as `find -size +0 \( -type f -o -type l \) -print'. <block>: The i'th <block> is the output of zlib's compress function -applied to the i'th blksize-sized chunk of the input data. +applied to the i'th blksize-sized chunk of the input data if the +corresponding CRAMFS_BLK_FLAG_UNCOMPRESSED <block_ptr> bit is not set, +otherwise it is the input data directly. (For the last <block> of the file, the input may of course be smaller.) Each <block> may be a different size. (See <block_pointer> above.) + <block>s are merely byte-aligned, not generally u32-aligned. +When CRAMFS_BLK_FLAG_DIRECT_PTR is specified then the corresponding +<block> may be located anywhere and not necessarily contiguous with +the previous/next blocks. In that case it is minimally u32-aligned. +If CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified then the size is always +blksize except for the last block which is limited by the file length. +If CRAMFS_BLK_FLAG_DIRECT_PTR is set and CRAMFS_BLK_FLAG_UNCOMPRESSED +is not set then the first 2 bytes of the block contain the size of the +remaining block data as this cannot be determined from the placement of +logically adjacent blocks.
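To make the extended <block_pointer> rules above concrete, here is a compact illustrative decoder. It is a sketch, not kernel code: the struct and function names are invented, a little-endian image is assumed, and the caller must pass the previous block's end offset for the classic layout. cramfs_readpage() further down implements the same logic in-kernel, including capping the last uncompressed block to the file length.

#include <stdint.h>

#define BLK_FLAG_UNCOMPRESSED	(1u << 31)	/* mirrors CRAMFS_BLK_FLAG_UNCOMPRESSED */
#define BLK_FLAG_DIRECT_PTR	(1u << 30)	/* mirrors CRAMFS_BLK_FLAG_DIRECT_PTR */
#define BLK_FLAGS		(BLK_FLAG_UNCOMPRESSED | BLK_FLAG_DIRECT_PTR)
#define BLK_DIRECT_PTR_SHIFT	2

struct blk_info {
	uint32_t start;		/* byte offset of the block data in the image */
	uint32_t len;		/* length of the (possibly compressed) data */
	int uncompressed;	/* nonzero if the data is stored verbatim */
};

/* image: the full filesystem image; ptr: raw <block_pointer> for block i;
 * prev_end: end offset of block i-1 (classic layout); blksize: block size */
static struct blk_info decode_blockptr(const uint8_t *image, uint32_t ptr,
				       uint32_t prev_end, uint32_t blksize)
{
	struct blk_info b;

	b.uncompressed = !!(ptr & BLK_FLAG_UNCOMPRESSED);
	if (ptr & BLK_FLAG_DIRECT_PTR) {
		/* the pointer holds the start offset shifted right by 2 bits */
		b.start = (ptr & ~BLK_FLAGS) << BLK_DIRECT_PTR_SHIFT;
		if (b.uncompressed) {
			/* full block; the last one is capped to the file length */
			b.len = blksize;
		} else {
			/* compressed length is in the first 2 bytes of the block */
			b.len = image[b.start] | (uint32_t)image[b.start + 1] << 8;
			b.start += 2;
		}
	} else {
		/* classic layout: the pointer is one past the end of this block */
		b.start = prev_end;
		b.len = (ptr & ~BLK_FLAGS) - prev_end;
	}
	return b;
}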
+ Holes ----- diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7919967488cb..017b0ab19bc4 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -15,10 +15,15 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/file.h> #include <linux/pagemap.h> +#include <linux/pfn_t.h> +#include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/super.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/mutex.h> @@ -36,6 +41,9 @@ struct cramfs_sb_info { unsigned long blocks; unsigned long files; unsigned long flags; + void *linear_virt_addr; + resource_size_t linear_phys_addr; + size_t mtd_point_size; }; static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) @@ -46,6 +54,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; +static const struct file_operations cramfs_physmem_fops; static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); @@ -93,6 +102,10 @@ static struct inode *get_cramfs_inode(struct super_block *sb, case S_IFREG: inode->i_fop = &generic_ro_fops; inode->i_data.a_ops = &cramfs_aops; + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && + CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS && + CRAMFS_SB(sb)->linear_phys_addr) + inode->i_fop = &cramfs_physmem_fops; break; case S_IFDIR: inode->i_op = &cramfs_dir_inode_operations; @@ -140,6 +153,9 @@ static struct inode *get_cramfs_inode(struct super_block *sb, * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to * worry about end-of-buffer issues even when decompressing a full * page cache. + * + * Note: This is all optimized away at compile time when + * CONFIG_CRAMFS_BLOCKDEV=n. */ #define READ_BUFFERS (2) /* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ @@ -160,10 +176,10 @@ static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* - * Returns a pointer to a buffer containing at least LEN bytes of - * filesystem starting at byte offset OFFSET into the filesystem. + * Populate our block cache and return a pointer to it. */ -static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) +static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, + unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; struct page *pages[BLKS_PER_BUF]; @@ -239,49 +255,278 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i return read_buffers[buffer] + offset; } +/* + * Return a pointer to the linearly addressed cramfs image in memory. + */ +static void *cramfs_direct_read(struct super_block *sb, unsigned int offset, + unsigned int len) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + + if (!len) + return NULL; + if (len > sbi->size || offset > sbi->size - len) + return page_address(ZERO_PAGE(0)); + return sbi->linear_virt_addr + offset; +} + +/* + * Returns a pointer to a buffer containing at least LEN bytes of + * filesystem starting at byte offset OFFSET into the filesystem. 
+ */ +static void *cramfs_read(struct super_block *sb, unsigned int offset, + unsigned int len) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr) + return cramfs_direct_read(sb, offset, len); + else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) + return cramfs_blkdev_read(sb, offset, len); + else + return NULL; +} + +/* + * For a mapping to be possible, we need a range of uncompressed and + * contiguous blocks. Return the offset for the first block and number of + * valid blocks for which that is true, or zero otherwise. + */ +static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + int i; + u32 *blockptrs, first_block_addr; + + /* + * We can dereference memory directly here as this code may be + * reached only when there is a direct filesystem image mapping + * available in memory. + */ + blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4); + first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS; + i = 0; + do { + u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT); + u32 expect = (first_block_addr + block_off) | + CRAMFS_BLK_FLAG_DIRECT_PTR | + CRAMFS_BLK_FLAG_UNCOMPRESSED; + if (blockptrs[i] != expect) { + pr_debug("range: block %d/%d got %#x expects %#x\n", + pgoff+i, pgoff + *pages - 1, + blockptrs[i], expect); + if (i == 0) + return 0; + break; + } + } while (++i < *pages); + + *pages = i; + return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT; +} + +#ifdef CONFIG_MMU + +/* + * Return true if the last page of a file in the filesystem image contains + * some other data that doesn't belong to that file. It is assumed that the + * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED + * (verified by cramfs_get_block_range()) and directly accessible in memory. + */ +static bool cramfs_last_page_is_shared(struct inode *inode) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + u32 partial, last_page, blockaddr, *blockptrs; + char *tail_data; + + partial = offset_in_page(inode->i_size); + if (!partial) + return false; + last_page = inode->i_size >> PAGE_SHIFT; + blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode)); + blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS; + blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + tail_data = sbi->linear_virt_addr + blockaddr + partial; + return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false; +} + +static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + unsigned int pages, max_pages, offset; + unsigned long address, pgoff = vma->vm_pgoff; + char *bailout_reason; + int ret; + + ret = generic_file_readonly_mmap(file, vma); + if (ret) + return ret; + + /* + * Now try to pre-populate ptes for this vma with a direct + * mapping avoiding memory allocation when possible. + */ + + /* Could COW work here?
*/ + bailout_reason = "vma is writable"; + if (vma->vm_flags & VM_WRITE) + goto bailout; + + max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + bailout_reason = "beyond file limit"; + if (pgoff >= max_pages) + goto bailout; + pages = min(vma_pages(vma), max_pages - pgoff); + + offset = cramfs_get_block_range(inode, pgoff, &pages); + bailout_reason = "unsuitable block layout"; + if (!offset) + goto bailout; + address = sbi->linear_phys_addr + offset; + bailout_reason = "data is not page aligned"; + if (!PAGE_ALIGNED(address)) + goto bailout; + + /* Don't map the last page if it contains some other data */ + if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) { + pr_debug("mmap: %s: last page is shared\n", + file_dentry(file)->d_name.name); + pages--; + } + + if (!pages) { + bailout_reason = "no suitable block remaining"; + goto bailout; + } + + if (pages == vma_pages(vma)) { + /* + * The entire vma is mappable. remap_pfn_range() will + * make it distinguishable from a non-direct mapping + * in /proc/<pid>/maps by substituting the file offset + * with the actual physical address. + */ + ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, + pages * PAGE_SIZE, vma->vm_page_prot); + } else { + /* + * Let's create a mixed map if we can't map it all. + * The normal paging machinery will take care of the + * unpopulated ptes via cramfs_readpage(). + */ + int i; + vma->vm_flags |= VM_MIXEDMAP; + for (i = 0; i < pages && !ret; i++) { + unsigned long off = i * PAGE_SIZE; + pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); + ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + } + } + + if (!ret) + pr_debug("mapped %s[%lu] at 0x%08lx (%u/%lu pages) " + "to vma 0x%08lx, page_prot 0x%llx\n", + file_dentry(file)->d_name.name, pgoff, + address, pages, vma_pages(vma), vma->vm_start, + (unsigned long long)pgprot_val(vma->vm_page_prot)); + return ret; + +bailout: + pr_debug("%s[%lu]: direct mmap impossible: %s\n", + file_dentry(file)->d_name.name, pgoff, bailout_reason); + /* Didn't manage any direct map, but normal paging is still possible */ + return 0; +} + +#else /* CONFIG_MMU */ + +static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 
0 : -ENOSYS; +} + +static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + unsigned int pages, block_pages, max_pages, offset; + + pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= max_pages || pages > max_pages - pgoff) + return -EINVAL; + block_pages = pages; + offset = cramfs_get_block_range(inode, pgoff, &block_pages); + if (!offset || block_pages != pages) + return -ENOSYS; + addr = sbi->linear_phys_addr + offset; + pr_debug("get_unmapped for %s ofs %#lx siz %lu at 0x%08lx\n", + file_dentry(file)->d_name.name, pgoff*PAGE_SIZE, len, addr); + return addr; +} + +static unsigned int cramfs_physmem_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | NOMMU_MAP_EXEC; +} + +#endif /* CONFIG_MMU */ + +static const struct file_operations cramfs_physmem_fops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .splice_read = generic_file_splice_read, + .mmap = cramfs_physmem_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = cramfs_physmem_get_unmapped_area, + .mmap_capabilities = cramfs_physmem_mmap_capabilities, +#endif +}; + static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); - kill_block_super(sb); + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { + if (sbi && sbi->mtd_point_size) + mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); + kill_mtd_super(sb); + } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { + kill_block_super(sb); + } kfree(sbi); } static int cramfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } -static int cramfs_fill_super(struct super_block *sb, void *data, int silent) +static int cramfs_read_super(struct super_block *sb, + struct cramfs_super *super, int silent) { - int i; - struct cramfs_super super; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned long root_offset; - struct cramfs_sb_info *sbi; - struct inode *root; - - sb->s_flags |= MS_RDONLY; - - sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - sb->s_fs_info = sbi; - /* Invalidate the read buffers on mount: think disk change..
*/ - mutex_lock(&read_mutex); - for (i = 0; i < READ_BUFFERS; i++) - buffer_blocknr[i] = -1; + /* We don't know the real size yet */ + sbi->size = PAGE_SIZE; /* Read the first block and get the superblock from it */ - memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super)); + mutex_lock(&read_mutex); + memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); /* Do sanity checks on the superblock */ - if (super.magic != CRAMFS_MAGIC) { + if (super->magic != CRAMFS_MAGIC) { /* check for wrong endianness */ - if (super.magic == CRAMFS_MAGIC_WEND) { + if (super->magic == CRAMFS_MAGIC_WEND) { if (!silent) pr_err("wrong endianness\n"); return -EINVAL; @@ -289,10 +534,12 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* check at 512 byte offset */ mutex_lock(&read_mutex); - memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super)); + memcpy(super, + cramfs_read(sb, 512, sizeof(*super)), + sizeof(*super)); mutex_unlock(&read_mutex); - if (super.magic != CRAMFS_MAGIC) { - if (super.magic == CRAMFS_MAGIC_WEND && !silent) + if (super->magic != CRAMFS_MAGIC) { + if (super->magic == CRAMFS_MAGIC_WEND && !silent) pr_err("wrong endianness\n"); else if (!silent) pr_err("wrong magic\n"); @@ -301,34 +548,34 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) } /* get feature flags first */ - if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { + if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) { pr_err("unsupported filesystem features\n"); return -EINVAL; } /* Check that the root inode is in a sane state */ - if (!S_ISDIR(super.root.mode)) { + if (!S_ISDIR(super->root.mode)) { pr_err("root is not a directory\n"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ - super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + super->root.mode |= 0555; - root_offset = super.root.offset << 2; - if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { - sbi->size = super.size; - sbi->blocks = super.fsid.blocks; - sbi->files = super.fsid.files; + root_offset = super->root.offset << 2; + if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) { + sbi->size = super->size; + sbi->blocks = super->fsid.blocks; + sbi->files = super->fsid.files; } else { sbi->size = 1<<28; sbi->blocks = 0; sbi->files = 0; } - sbi->magic = super.magic; - sbi->flags = super.flags; + sbi->magic = super->magic; + sbi->flags = super->flags; if (root_offset == 0) pr_info("empty filesystem"); - else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && + else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { @@ -336,9 +583,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) return -EINVAL; } + return 0; +} + +static int cramfs_finalize_super(struct super_block *sb, + struct cramfs_inode *cramfs_root) +{ + struct inode *root; + /* Set it all up.. 
*/ + sb->s_flags |= SB_RDONLY; sb->s_op = &cramfs_ops; - root = get_cramfs_inode(sb, &super.root, 0); + root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); @@ -347,10 +603,79 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } +static int cramfs_blkdev_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct cramfs_sb_info *sbi; + struct cramfs_super super; + int i, err; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Invalidate the read buffers on mount: think disk change.. */ + for (i = 0; i < READ_BUFFERS; i++) + buffer_blocknr[i] = -1; + + err = cramfs_read_super(sb, &super, silent); + if (err) + return err; + return cramfs_finalize_super(sb, &super.root); +} + +static int cramfs_mtd_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct cramfs_sb_info *sbi; + struct cramfs_super super; + int err; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Map only one page for now. Will remap it when fs size is known. */ + err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size, + &sbi->linear_virt_addr, &sbi->linear_phys_addr); + if (err || sbi->mtd_point_size != PAGE_SIZE) { + pr_err("unable to get direct memory access to mtd:%s\n", + sb->s_mtd->name); + return err ? : -ENODATA; + } + + pr_info("checking physical address %pap for linear cramfs image\n", + &sbi->linear_phys_addr); + err = cramfs_read_super(sb, &super, silent); + if (err) + return err; + + /* Remap the whole filesystem now */ + pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n", + sb->s_mtd->name, sbi->size/1024); + mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE); + err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size, + &sbi->linear_virt_addr, &sbi->linear_phys_addr); + if (err || sbi->mtd_point_size != sbi->size) { + pr_err("unable to get direct memory access to mtd:%s\n", + sb->s_mtd->name); + return err ? : -ENODATA; + } + + return cramfs_finalize_super(sb, &super.root); +} + static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_SIZE; @@ -502,34 +827,86 @@ static int cramfs_readpage(struct file *file, struct page *page) if (page->index < maxblock) { struct super_block *sb = inode->i_sb; - u32 blkptr_offset = OFFSET(inode) + page->index*4; - u32 start_offset, compr_len; + u32 blkptr_offset = OFFSET(inode) + page->index * 4; + u32 block_ptr, block_start, block_len; + bool uncompressed, direct; - start_offset = OFFSET(inode) + maxblock*4; mutex_lock(&read_mutex); - if (page->index) - start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, - 4); - compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - - start_offset); - mutex_unlock(&read_mutex); + block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4); + uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED); + direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR); + block_ptr &= ~CRAMFS_BLK_FLAGS; + + if (direct) { + /* + * The block pointer is an absolute start pointer, + * shifted by 2 bits. 
The size is included in the + * first 2 bytes of the data block when compressed, + * or PAGE_SIZE otherwise. + */ + block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT; + if (uncompressed) { + block_len = PAGE_SIZE; + /* if last block: cap to file length */ + if (page->index == maxblock - 1) + block_len = + offset_in_page(inode->i_size); + } else { + block_len = *(u16 *) + cramfs_read(sb, block_start, 2); + block_start += 2; + } + } else { + /* + * The block pointer indicates one past the end of + * the current block (start of next block). If this + * is the first block then it starts where the block + * pointer table ends, otherwise its start comes + * from the previous block's pointer. + */ + block_start = OFFSET(inode) + maxblock * 4; + if (page->index) + block_start = *(u32 *) + cramfs_read(sb, blkptr_offset - 4, 4); + /* Beware... previous ptr might be a direct ptr */ + if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { + /* See comments on earlier code. */ + u32 prev_start = block_start; + block_start = prev_start & ~CRAMFS_BLK_FLAGS; + block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { + block_start += PAGE_SIZE; + } else { + block_len = *(u16 *) + cramfs_read(sb, block_start, 2); + block_start += 2 + block_len; + } + } + block_start &= ~CRAMFS_BLK_FLAGS; + block_len = block_ptr - block_start; + } - if (compr_len == 0) + if (block_len == 0) ; /* hole */ - else if (unlikely(compr_len > (PAGE_SIZE << 1))) { - pr_err("bad compressed blocksize %u\n", - compr_len); + else if (unlikely(block_len > 2*PAGE_SIZE || + (uncompressed && block_len > PAGE_SIZE))) { + mutex_unlock(&read_mutex); + pr_err("bad data blocksize %u\n", block_len); goto err; + } else if (uncompressed) { + memcpy(pgdata, + cramfs_read(sb, block_start, block_len), + block_len); + bytes_filled = block_len; } else { - mutex_lock(&read_mutex); bytes_filled = cramfs_uncompress_block(pgdata, PAGE_SIZE, - cramfs_read(sb, start_offset, compr_len), - compr_len); - mutex_unlock(&read_mutex); - if (unlikely(bytes_filled < 0)) - goto err; + cramfs_read(sb, block_start, block_len), + block_len); } + mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; } memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); @@ -573,10 +950,22 @@ static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; -static struct dentry *cramfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct dentry *cramfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); + struct dentry *ret = ERR_PTR(-ENOPROTOOPT); + + if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { + ret = mount_mtd(fs_type, flags, dev_name, data, + cramfs_mtd_fill_super); + if (!IS_ERR(ret)) + return ret; + } + if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) { + ret = mount_bdev(fs_type, flags, dev_name, data, + cramfs_blkdev_fill_super); + } + return ret; } static struct file_system_type cramfs_fs_type = { @@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, void *entry, sector_t sector, - unsigned long flags) + unsigned long flags, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; void *new_entry; pgoff_t index = vmf->pgoff; - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) __mark_inode_dirty(mapping->host, 
I_DIRTY_PAGES); if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { @@ -565,11 +565,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, - new_entry, NULL, NULL); + new_entry, NULL); entry = new_entry; } - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); @@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD pmd_t pmd; @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -643,7 +649,6 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } @@ -789,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, tag_pages_for_writeback(mapping, start_index, end_index); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, @@ -820,38 +825,42 @@ out: } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void *entry, - struct vm_area_struct *vma, struct vm_fault *vmf) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - unsigned long vaddr = vmf->address; - void *ret, *kaddr; + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; +} + +static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, + pfn_t *pfnp) +{ + const sector_t sector = dax_iomap_sector(iomap, pos); pgoff_t pgoff; + void *kaddr; int id, rc; - pfn_t pfn; + long length; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); if (rc) return rc; - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (rc < 0) { - dax_read_unlock(id); - return rc; + length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), + &kaddr, pfnp); + if (length < 0) { + rc = length; + goto out; } + rc = -EINVAL; + if (PFN_PHYS(length) < size) + goto out; + if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + goto out; + /* For larger pages we need devmap */ + if (length > 1 && !pfn_t_devmap(*pfnp)) + goto out; + rc = 0; +out: dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - trace_dax_insert_mapping(mapping->host, vmf, ret); - if (vmf->flags & FAULT_FLAG_WRITE) - return vm_insert_mixed_mkwrite(vma, vaddr, pfn); - else - return vm_insert_mixed(vma, vaddr, pfn); + return rc; } /* @@ -877,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } entry2 = 
dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_ZERO_PAGE); + RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; @@ -936,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) -{ - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; -} - static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) @@ -1080,19 +1084,33 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -static int dax_iomap_pte_fault(struct vm_fault *vmf, +/* + * MAP_SYNC on a dax mapping guarantees dirty metadata is + * flushed on write-faults (non-cow), but not read-faults. + */ +static bool dax_fault_is_synchronous(unsigned long flags, + struct vm_area_struct *vma, struct iomap *iomap) +{ + return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) + && (iomap->flags & IOMAP_F_DIRTY); +} + +static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; int vmf_ret = 0; void *entry; + pfn_t pfn; trace_dax_pte_fault(inode, vmf, vmf_ret); /* @@ -1105,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto out; } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + if (write && !vmf->cow_page) flags |= IOMAP_WRITE; entry = grab_mapping_entry(mapping, vmf->pgoff, 0); @@ -1140,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto error_finish_iomap; } - sector = dax_iomap_sector(&iomap, pos); - if (vmf->cow_page) { + sector_t sector = dax_iomap_sector(&iomap, pos); + switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: @@ -1168,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto finish_iomap; } + sync = dax_fault_is_synchronous(flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, entry, vmf->vma, vmf); + error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); + if (error < 0) + goto error_finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + 0, write && !sync); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto error_finish_iomap; + } + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PTE into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. 
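The dax_fault_is_synchronous() test above is the kernel half of the new MAP_SYNC contract: on a write fault the filesystem must make dirty metadata durable before the PTE goes in. The userspace half looks roughly like this sketch — /mnt/pmem/log is a hypothetical file on a DAX mount, and the fallback flag values are the generic ones in case the toolchain headers predate MAP_SYNC:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE	0x03
#endif
#ifndef MAP_SYNC
#define MAP_SYNC		0x80000
#endif

int main(void)
{
	int fd = open("/mnt/pmem/log", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	/* MAP_SHARED_VALIDATE forces the kernel to reject flags it
	 * does not understand, so MAP_SYNC either takes effect or
	 * mmap() fails outright (e.g. on a non-DAX file, per the
	 * ext4_file_mmap() hunk further down). */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Once the write fault completes, the metadata backing this
	 * page is already durable; flushing CPU caches (e.g. via
	 * libpmem) then persists the data with no fsync() needed. */
	strcpy(p, "hello, MAP_SYNC");
	munmap(p, 4096);
	close(fd);
	return 0;
}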
+ */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) { + error = -EIO; + goto error_finish_iomap; + } + *pfnp = pfn; + vmf_ret = VM_FAULT_NEEDDSYNC | major; + goto finish_iomap; + } + trace_dax_insert_mapping(inode, vmf, entry); + if (write) + error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + error = vm_insert_mixed(vma, vaddr, pfn); + /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) { + if (!write) { vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } @@ -1218,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - struct address_space *mapping = vmf->vma->vm_file->f_mapping; - const sector_t sector = dax_iomap_sector(iomap, pos); - struct dax_device *dax_dev = iomap->dax_dev; - struct block_device *bdev = iomap->bdev; - struct inode *inode = mapping->host; - const size_t size = PMD_SIZE; - void *ret = NULL, *kaddr; - long length = 0; - pgoff_t pgoff; - pfn_t pfn = {}; - int id; - - if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) - goto fallback; - - id = dax_read_lock(); - length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (length < 0) - goto unlock_fallback; - length = PFN_PHYS(length); - - if (length < size) - goto unlock_fallback; - if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) - goto unlock_fallback; - if (!pfn_t_devmap(pfn)) - goto unlock_fallback; - dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, - RADIX_DAX_PMD); - if (IS_ERR(ret)) - goto fallback; - - trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, vmf->flags & FAULT_FLAG_WRITE); - -unlock_fallback: - dax_read_unlock(id); -fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); - return VM_FAULT_FALLBACK; -} +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) @@ -1283,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1305,13 +1314,14 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; @@ -1320,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, void *entry; loff_t pos; int error; + pfn_t pfn; /* * Check whether offset isn't beyond end of file now. Caller is @@ -1327,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * this is a reliable test. 
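PG_PMD_COLOUR is simply the page-index mask within one PMD (511 for 4 KiB pages and 2 MiB PMDs). With max_pgoff redefined just below as DIV_ROUND_UP(i_size, PAGE_SIZE) — a page count rather than a last valid index — the two bounds tests become plain ">=" comparisons. A self-contained sketch of the arithmetic, with x86-64 constants assumed:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SIZE	(1UL << 21)
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)	/* 511 */

/* Mirrors the dax_iomap_pmd_fault() bounds tests: max_pgoff counts the
 * pages in the file, so ">=" means "at or past EOF". */
static const char *pmd_fault_check(unsigned long pgoff,
				   unsigned long max_pgoff)
{
	if (pgoff >= max_pgoff)
		return "SIGBUS (fault beyond EOF)";
	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
		return "fallback (PMD would extend past EOF)";
	return "PMD mapping allowed";
}

int main(void)
{
	/* 2 MiB file => max_pgoff = 512 */
	printf("%s\n", pmd_fault_check(0, 512));
	/* 1.5 MiB file => max_pgoff = 384 */
	printf("%s\n", pmd_fault_check(0, 384));
	printf("%s\n", pmd_fault_check(512, 512));
	return 0;
}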
*/ pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1351,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff > max_pgoff) { + if (pgoff >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* @@ -1395,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; + sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); + if (error < 0) + goto finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + RADIX_DAX_PMD, write && !sync); + if (IS_ERR(entry)) + goto finish_iomap; + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PMD into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) + goto finish_iomap; + *pfnp = pfn; + result = VM_FAULT_NEEDDSYNC; + goto finish_iomap; + } + + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); + result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: @@ -1437,7 +1476,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1447,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @ops: iomap ops passed from the file system + * @pe_size: Size of the page to fault in + * @pfnp: PFN to insert for synchronous faults if fsync is required + * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller @@ -1455,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * successfully. */ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops) + pfn_t *pfnp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, ops); + return dax_iomap_pte_fault(vmf, pfnp, ops); case PE_SIZE_PMD: - return dax_iomap_pmd_fault(vmf, ops); + return dax_iomap_pmd_fault(vmf, pfnp, ops); default: return VM_FAULT_FALLBACK; } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +/** + * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function inserts writeable PTE or PMD entry into page tables for mmaped + * DAX file. It takes care of marking corresponding radix tree entry as dirty + * as well. 
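dax_finish_sync_fault() below is the second half of a two-phase protocol; the first half is the VM_FAULT_NEEDDSYNC return from dax_iomap_fault() seen earlier. A filesystem wires them up roughly as follows (ext4's real version appears further down; the handler and iomap-ops names here are hypothetical, and journalling is omitted):

static const struct iomap_ops example_iomap_ops;	/* hypothetical */

static int example_dax_huge_fault(struct vm_fault *vmf,
				  enum page_entry_size pe_size)
{
	pfn_t pfn;
	int result;

	/* Phase 1: allocate/map blocks and install the radix tree
	 * entry.  For a MAP_SYNC write fault on an inode with dirty
	 * metadata, dax_iomap_fault() stops short of touching the
	 * page tables and hands the pfn back instead. */
	result = dax_iomap_fault(vmf, pe_size, &pfn, &example_iomap_ops);

	/* Phase 2: flush metadata covering the faulted range, then
	 * insert the PTE/PMD via dax_insert_pfn_mkwrite(). */
	if (result & VM_FAULT_NEEDDSYNC)
		result = dax_finish_sync_fault(vmf, pe_size, pfn);

	return result;
}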
+ */ +static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, + enum page_entry_size pe_size, + pfn_t pfn) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + void *entry, **slot; + pgoff_t index = vmf->pgoff; + int vmf_ret, error; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + /* Did we race with someone splitting entry or so? */ + if (!entry || + (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || + (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, + VM_FAULT_NOPAGE); + return VM_FAULT_NOPAGE; + } + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + entry = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + switch (pe_size) { + case PE_SIZE_PTE: + error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + vmf_ret = dax_fault_return(error); + break; +#ifdef CONFIG_FS_DAX_PMD + case PE_SIZE_PMD: + vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + pfn, true); + break; +#endif + default: + vmf_ret = VM_FAULT_FALLBACK; + } + put_locked_mapping_entry(mapping, index); + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); + return vmf_ret; +} + +/** + * dax_finish_sync_fault - finish synchronous page fault + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function ensures that the file range touched by the page fault is + * stored persistently on the media and handles inserting of appropriate page + * table entry. + */ +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn) +{ + int err; + loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; + size_t len = 0; + + if (pe_size == PE_SIZE_PTE) + len = PAGE_SIZE; + else if (pe_size == PE_SIZE_PMD) + len = PMD_SIZE; + else + WARN_ON_ONCE(1); + err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); + if (err) + return VM_FAULT_SIGBUS; + return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); +} +EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/fs/dcache.c b/fs/dcache.c index bcc9f6981569..5c7df1df81ff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target) */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); - kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6dabc4a10396..cd12e6576b48 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1,16 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. 
- * */ #include <linux/module.h> @@ -22,7 +18,6 @@ #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> -#include <linux/srcu.h> #include <asm/poll.h> #include "internal.h" @@ -48,66 +43,108 @@ const struct file_operations debugfs_noop_file_operations = { .llseek = noop_llseek, }; +#define F_DENTRY(filp) ((filp)->f_path.dentry) + +const struct file_operations *debugfs_real_fops(const struct file *filp) +{ + struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; + + if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) { + /* + * Urgh, we've been called w/o a protecting + * debugfs_file_get(). + */ + WARN_ON(1); + return NULL; + } + + return fsd->real_fops; +} +EXPORT_SYMBOL_GPL(debugfs_real_fops); + /** - * debugfs_use_file_start - mark the beginning of file data access + * debugfs_file_get - mark the beginning of file data access * @dentry: the dentry object whose data is being accessed. - * @srcu_idx: a pointer to some memory to store a SRCU index in. * - * Up to a matching call to debugfs_use_file_finish(), any - * successive call into the file removing functions debugfs_remove() - * and debugfs_remove_recursive() will block. Since associated private + * Up to a matching call to debugfs_file_put(), any successive call + * into the file removing functions debugfs_remove() and + * debugfs_remove_recursive() will block. Since associated private * file data may only get freed after a successful return of any of * the removal functions, you may safely access it after a successful - * call to debugfs_use_file_start() without worrying about - * lifetime issues. + * call to debugfs_file_get() without worrying about lifetime issues. * * If -%EIO is returned, the file has already been removed and thus, * it is not safe to access any of its data. If, on the other hand, * it is allowed to access the file data, zero is returned. - * - * Regardless of the return code, any call to - * debugfs_use_file_start() must be followed by a matching call - * to debugfs_use_file_finish(). */ -int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) - __acquires(&debugfs_srcu) +int debugfs_file_get(struct dentry *dentry) { - *srcu_idx = srcu_read_lock(&debugfs_srcu); - barrier(); + struct debugfs_fsdata *fsd; + void *d_fsd; + + d_fsd = READ_ONCE(dentry->d_fsdata); + if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { + fsd = d_fsd; + } else { + fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); + if (!fsd) + return -ENOMEM; + + fsd->real_fops = (void *)((unsigned long)d_fsd & + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + refcount_set(&fsd->active_users, 1); + init_completion(&fsd->active_users_drained); + if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) { + kfree(fsd); + fsd = READ_ONCE(dentry->d_fsdata); + } + } + + /* + * In case of a successful cmpxchg() above, this check is + * strictly necessary and must follow it, see the comment in + * __debugfs_remove_file(). + * OTOH, if the cmpxchg() hasn't been executed or wasn't + * successful, this serves the purpose of not starving + * removers. + */ if (d_unlinked(dentry)) return -EIO; + + if (!refcount_inc_not_zero(&fsd->active_users)) + return -EIO; + return 0; } -EXPORT_SYMBOL_GPL(debugfs_use_file_start); +EXPORT_SYMBOL_GPL(debugfs_file_get); /** - * debugfs_use_file_finish - mark the end of file data access - * @srcu_idx: the SRCU index "created" by a former call to - * debugfs_use_file_start(). 
+ * debugfs_file_put - mark the end of file data access + * @dentry: the dentry object formerly passed to + * debugfs_file_get(). * * Allow any ongoing concurrent call into debugfs_remove() or * debugfs_remove_recursive() blocked by a former call to - * debugfs_use_file_start() to proceed and return to its caller. + * debugfs_file_get() to proceed and return to its caller. */ -void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) +void debugfs_file_put(struct dentry *dentry) { - srcu_read_unlock(&debugfs_srcu, srcu_idx); -} -EXPORT_SYMBOL_GPL(debugfs_use_file_finish); + struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata); -#define F_DENTRY(filp) ((filp)->f_path.dentry) + if (refcount_dec_and_test(&fsd->active_users)) + complete(&fsd->active_users_drained); +} +EXPORT_SYMBOL_GPL(debugfs_file_put); static int open_proxy_open(struct inode *inode, struct file *filp) { - const struct dentry *dentry = F_DENTRY(filp); + struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; - int srcu_idx, r; + int r; - r = debugfs_use_file_start(dentry, &srcu_idx); - if (r) { - r = -ENOENT; - goto out; - } + r = debugfs_file_get(dentry); + if (r) + return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); @@ -124,7 +161,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) r = real_fops->open(inode, filp); out: - debugfs_use_file_finish(srcu_idx); + debugfs_file_put(dentry); return r; } @@ -138,16 +175,16 @@ const struct file_operations debugfs_open_proxy_file_operations = { #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ static ret_type full_proxy_ ## name(proto) \ { \ - const struct dentry *dentry = F_DENTRY(filp); \ - const struct file_operations *real_fops = \ - debugfs_real_fops(filp); \ - int srcu_idx; \ + struct dentry *dentry = F_DENTRY(filp); \ + const struct file_operations *real_fops; \ ret_type r; \ \ - r = debugfs_use_file_start(dentry, &srcu_idx); \ - if (likely(!r)) \ - r = real_fops->name(args); \ - debugfs_use_file_finish(srcu_idx); \ + r = debugfs_file_get(dentry); \ + if (unlikely(r)) \ + return r; \ + real_fops = debugfs_real_fops(filp); \ + r = real_fops->name(args); \ + debugfs_file_put(dentry); \ return r; \ } @@ -172,18 +209,16 @@ FULL_PROXY_FUNC(unlocked_ioctl, long, filp, static unsigned int full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { - const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = debugfs_real_fops(filp); - int srcu_idx; + struct dentry *dentry = F_DENTRY(filp); unsigned int r = 0; + const struct file_operations *real_fops; - if (debugfs_use_file_start(dentry, &srcu_idx)) { - debugfs_use_file_finish(srcu_idx); + if (debugfs_file_get(dentry)) return POLLHUP; - } + real_fops = debugfs_real_fops(filp); r = real_fops->poll(filp, wait); - debugfs_use_file_finish(srcu_idx); + debugfs_file_put(dentry); return r; } @@ -227,16 +262,14 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops, static int full_proxy_open(struct inode *inode, struct file *filp) { - const struct dentry *dentry = F_DENTRY(filp); + struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; struct file_operations *proxy_fops = NULL; - int srcu_idx, r; + int r; - r = debugfs_use_file_start(dentry, &srcu_idx); - if (r) { - r = -ENOENT; - goto out; - } + r = debugfs_file_get(dentry); + if (r) + return r == -EIO ? 
-ENOENT : r; real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); @@ -274,7 +307,7 @@ free_proxy: kfree(proxy_fops); fops_put(real_fops); out: - debugfs_use_file_finish(srcu_idx); + debugfs_file_put(dentry); return r; } @@ -285,13 +318,14 @@ const struct file_operations debugfs_full_proxy_file_operations = { ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { + struct dentry *dentry = F_DENTRY(file); ssize_t ret; - int srcu_idx; - ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!ret)) - ret = simple_attr_read(file, buf, len, ppos); - debugfs_use_file_finish(srcu_idx); + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + ret = simple_attr_read(file, buf, len, ppos); + debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_read); @@ -299,13 +333,14 @@ EXPORT_SYMBOL_GPL(debugfs_attr_read); ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { + struct dentry *dentry = F_DENTRY(file); ssize_t ret; - int srcu_idx; - ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!ret)) - ret = simple_attr_write(file, buf, len, ppos); - debugfs_use_file_finish(srcu_idx); + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + ret = simple_attr_write(file, buf, len, ppos); + debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_write); @@ -739,14 +774,14 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, { char buf[3]; bool val; - int r, srcu_idx; + int r; + struct dentry *dentry = F_DENTRY(file); - r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!r)) - val = *(bool *)file->private_data; - debugfs_use_file_finish(srcu_idx); - if (r) + r = debugfs_file_get(dentry); + if (unlikely(r)) return r; + val = *(bool *)file->private_data; + debugfs_file_put(dentry); if (val) buf[0] = 'Y'; @@ -764,8 +799,9 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size; bool bv; - int r, srcu_idx; + int r; bool *val = file->private_data; + struct dentry *dentry = F_DENTRY(file); buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -773,12 +809,11 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, buf[buf_size] = '\0'; if (strtobool(buf, &bv) == 0) { - r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!r)) - *val = bv; - debugfs_use_file_finish(srcu_idx); - if (r) + r = debugfs_file_get(dentry); + if (unlikely(r)) return r; + *val = bv; + debugfs_file_put(dentry); } return count; @@ -840,14 +875,15 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; + struct dentry *dentry = F_DENTRY(file); ssize_t r; - int srcu_idx; - r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!r)) - r = simple_read_from_buffer(user_buf, count, ppos, blob->data, - blob->size); - debugfs_use_file_finish(srcu_idx); + r = debugfs_file_get(dentry); + if (unlikely(r)) + return r; + r = simple_read_from_buffer(user_buf, count, ppos, blob->data, + blob->size); + debugfs_file_put(dentry); return r; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c59f015f386e..63a998c3f252 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,16 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file 
system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * * debugfs is for people to use instead of /proc or /sys. * See ./Documentation/core-api/kernel-api.rst for more details. - * */ #include <linux/module.h> @@ -27,14 +23,11 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> -#include <linux/srcu.h> #include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 -DEFINE_SRCU(debugfs_srcu); - static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; @@ -185,6 +178,14 @@ static const struct super_operations debugfs_super_operations = { .evict_inode = debugfs_evict_inode, }; +static void debugfs_release_dentry(struct dentry *dentry) +{ + void *fsd = dentry->d_fsdata; + + if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) + kfree(dentry->d_fsdata); +} + static struct vfsmount *debugfs_automount(struct path *path) { debugfs_automount_t f; @@ -194,6 +195,7 @@ static struct vfsmount *debugfs_automount(struct path *path) static const struct dentry_operations debugfs_dops = { .d_delete = always_delete_dentry, + .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; @@ -358,7 +360,8 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_private = data; inode->i_fop = proxy_fops; - dentry->d_fsdata = (void *)real_fops; + dentry->d_fsdata = (void *)((unsigned long)real_fops | + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); @@ -615,18 +618,43 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, } EXPORT_SYMBOL_GPL(debugfs_create_symlink); +static void __debugfs_remove_file(struct dentry *dentry, struct dentry *parent) +{ + struct debugfs_fsdata *fsd; + + simple_unlink(d_inode(parent), dentry); + d_delete(dentry); + + /* + * Paired with the closing smp_mb() implied by a successful + * cmpxchg() in debugfs_file_get(): either + * debugfs_file_get() must see a dead dentry or we must see a + * debugfs_fsdata instance at ->d_fsdata here (or both). 
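This removal path is what the per-file refcount buys: debugfs_remove() now waits only for users of that one file instead of a global synchronize_srcu() grace period. On the accessor side the conversion is mechanical, as the file.c hunks above show; a hypothetical driver-side handler following the same pattern (my_value is an invented per-file datum):

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static ssize_t my_debugfs_read(struct file *file, char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct dentry *dentry = file->f_path.dentry;
	u32 *my_value = file->private_data;
	char tmp[16];
	ssize_t ret;

	/* Pin the file data against concurrent removal. */
	ret = debugfs_file_get(dentry);
	if (unlikely(ret))
		return ret;	/* already unlinked: -EIO */

	ret = scnprintf(tmp, sizeof(tmp), "%u\n", *my_value);
	ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);

	debugfs_file_put(dentry);
	return ret;
}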
+ */ + smp_mb(); + fsd = READ_ONCE(dentry->d_fsdata); + if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) + return; + if (!refcount_dec_and_test(&fsd->active_users)) + wait_for_completion(&fsd->active_users_drained); +} + static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; if (simple_positive(dentry)) { dget(dentry); - if (d_is_dir(dentry)) - ret = simple_rmdir(d_inode(parent), dentry); - else - simple_unlink(d_inode(parent), dentry); - if (!ret) - d_delete(dentry); + if (!d_is_reg(dentry)) { + if (d_is_dir(dentry)) + ret = simple_rmdir(d_inode(parent), dentry); + else + simple_unlink(d_inode(parent), dentry); + if (!ret) + d_delete(dentry); + } else { + __debugfs_remove_file(dentry, parent); + } dput(dentry); } return ret; @@ -660,8 +688,6 @@ void debugfs_remove(struct dentry *dentry) inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - - synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove); @@ -735,8 +761,6 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); inode_unlock(d_inode(parent)); - - synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index b3e8443a1f47..f0d73d86cc1a 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * internal.h - declarations internal to debugfs * * Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * */ #ifndef _DEBUGFS_INTERNAL_H_ @@ -19,4 +15,18 @@ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; +struct debugfs_fsdata { + const struct file_operations *real_fops; + refcount_t active_users; + struct completion active_users_drained; +}; + +/* + * A dentry's ->d_fsdata either points to the real fops or to a + * dynamically allocated debugfs_fsdata instance. + * In order to distinguish between these two cases, a real fops + * pointer gets its lowest bit set. 
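The comment above describes a classic pointer-tagging trick: both possible targets of ->d_fsdata are at least word-aligned, so bit 0 is free to record which one is stored, and the first debugfs_file_get() upgrades a tagged fops pointer to a real debugfs_fsdata with cmpxchg(). A userspace toy of just the tagging part:

#include <stdint.h>
#include <stdio.h>

#define IS_REAL_FOPS_BIT 1UL

static void *tag_real_fops(const void *fops)
{
	return (void *)((uintptr_t)fops | IS_REAL_FOPS_BIT);
}

static int is_real_fops(const void *fsd)
{
	return (uintptr_t)fsd & IS_REAL_FOPS_BIT;
}

static void *untag(const void *fsd)
{
	return (void *)((uintptr_t)fsd & ~IS_REAL_FOPS_BIT);
}

int main(void)
{
	long dummy_fops;	/* stand-in for a word-aligned fops */
	void *fsd = tag_real_fops(&dummy_fops);

	printf("tagged: %d, pointer restored: %d\n",
	       is_real_fops(fsd), untag(fsd) == (void *)&dummy_fops);
	return 0;
}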
+ */ +#define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0) + #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e5e29f8c920b..846ca150d52e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -36,27 +36,13 @@ #include <linux/scatterlist.h> #include <linux/slab.h> #include <asm/unaligned.h> +#include <linux/kernel.h> #include "ecryptfs_kernel.h" #define DECRYPT 0 #define ENCRYPT 1 /** - * ecryptfs_to_hex - * @dst: Buffer to take hex character representation of contents of - * src; must be at least of size (src_size * 2) - * @src: Buffer to be converted to a hex string representation - * @src_size: number of bytes to convert - */ -void ecryptfs_to_hex(char *dst, char *src, size_t src_size) -{ - int x; - - for (x = 0; x < src_size; x++) - sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]); -} - -/** * ecryptfs_from_hex * @dst: Buffer to take the bytes from src hex; must be at least of * size (src_size / 2) @@ -899,8 +885,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, u32 flags; flags = get_unaligned_be32(page_virt); - for (i = 0; i < ((sizeof(ecryptfs_flag_map) - / sizeof(struct ecryptfs_flag_map_elem))); i++) + for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++) if (flags & ecryptfs_flag_map[i].file_flag) { crypt_stat->flags |= ecryptfs_flag_map[i].local_flag; } else @@ -937,8 +922,7 @@ void ecryptfs_write_crypt_stat_flags(char *page_virt, u32 flags = 0; int i; - for (i = 0; i < ((sizeof(ecryptfs_flag_map) - / sizeof(struct ecryptfs_flag_map_elem))); i++) + for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++) if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag) flags |= ecryptfs_flag_map[i].file_flag; /* Version is in top 8 bits of the 32-bit flag vector */ @@ -1434,8 +1418,6 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER); if (!page_virt) { rc = -ENOMEM; - printk(KERN_ERR "%s: Unable to allocate page_virt\n", - __func__); goto out; } rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, @@ -1522,9 +1504,6 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, filename->encrypted_filename = kmalloc(filename->encrypted_filename_size, GFP_KERNEL); if (!filename->encrypted_filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc [%zd] bytes\n", __func__, - filename->encrypted_filename_size); rc = -ENOMEM; goto out; } @@ -1669,12 +1648,10 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL); - if (key_tfm != NULL) + if (key_tfm) (*key_tfm) = tmp_tfm; if (!tmp_tfm) { rc = -ENOMEM; - printk(KERN_ERR "Error attempting to allocate from " - "ecryptfs_key_tfm_cache\n"); goto out; } mutex_init(&tmp_tfm->key_tfm_mutex); @@ -1690,7 +1667,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, "cipher with name = [%s]; rc = [%d]\n", tmp_tfm->cipher_name, rc); kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm); - if (key_tfm != NULL) + if (key_tfm) (*key_tfm) = NULL; goto out; } @@ -1881,7 +1858,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, size_t src_byte_offset = 0; size_t dst_byte_offset = 0; - if (dst == NULL) { + if (!dst) { (*dst_size) = ecryptfs_max_decoded_size(src_size); goto out; } @@ -1949,9 +1926,6 @@ int ecryptfs_encrypt_and_encode_filename( filename = kzalloc(sizeof(*filename), GFP_KERNEL); if 
(!filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kzalloc [%zd] bytes\n", __func__, - sizeof(*filename)); rc = -ENOMEM; goto out; } @@ -1980,9 +1954,6 @@ int ecryptfs_encrypt_and_encode_filename( + encoded_name_no_prefix_size); (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); if (!(*encoded_name)) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kzalloc [%zd] bytes\n", __func__, - (*encoded_name_size)); rc = -ENOMEM; kfree(filename->encrypted_filename); kfree(filename); @@ -2064,9 +2035,6 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, name, name_size); decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); if (!decoded_name) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc [%zd] bytes\n", __func__, - decoded_name_size); rc = -ENOMEM; goto out; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 3fbc0ff79699..e74cb2a0b299 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -31,6 +31,7 @@ #include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> +#include <linux/kernel.h> #include <linux/fs.h> #include <linux/fs_stack.h> #include <linux/namei.h> @@ -51,7 +52,13 @@ #define ECRYPTFS_XATTR_NAME "user.ecryptfs" void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); -extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); +static inline void +ecryptfs_to_hex(char *dst, char *src, size_t src_size) +{ + char *end = bin2hex(dst, src, src_size); + *end = '\0'; +} + extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); struct ecryptfs_key_record { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index efc2db42d175..847904aa63a9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -64,7 +64,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) /* i_size will be overwritten for encrypted regular files */ fsstack_copy_inode_size(inode, lower_inode); inode->i_ino = lower_inode->i_ino; - inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; if (S_ISLNK(inode->i_mode)) @@ -334,9 +333,6 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); if (!dentry_info) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to allocate ecryptfs_dentry_info struct\n", - __func__); dput(lower_dentry); return ERR_PTR(-ENOMEM); } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index fa218cd64f74..c89a58cfc991 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -639,11 +639,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, int rc = 0; s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) { - printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " - "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + if (!s) return -ENOMEM; - } + (*packet_size) = 0; rc = ecryptfs_find_auth_tok_for_sig( &auth_tok_key, @@ -687,7 +685,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, * separator, and then the filename */ s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE + s->block_aligned_filename_size); - if (dest == NULL) { + if (!dest) { (*packet_size) = s->max_packet_size; goto out_unlock; } @@ -714,9 +712,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, GFP_KERNEL); if (!s->block_aligned_filename) { - 
printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " - "kzalloc [%zd] bytes\n", __func__, - s->block_aligned_filename_size); rc = -ENOMEM; goto out_unlock; } @@ -769,10 +764,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, s->hash_desc = kmalloc(sizeof(*s->hash_desc) + crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); if (!s->hash_desc) { - printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - sizeof(*s->hash_desc) + - crypto_shash_descsize(s->hash_tfm)); rc = -ENOMEM; goto out_release_free_unlock; } @@ -925,11 +916,9 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, (*filename_size) = 0; (*filename) = NULL; s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) { - printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " - "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + if (!s) return -ENOMEM; - } + if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " "at least [%d]\n", __func__, max_packet_size, @@ -1015,9 +1004,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, s->decrypted_filename = kmalloc(s->block_aligned_filename_size, GFP_KERNEL); if (!s->decrypted_filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - s->block_aligned_filename_size); rc = -ENOMEM; goto out_unlock; } @@ -1097,9 +1083,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, } (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); if (!(*filename)) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - ((*filename_size) + 1)); rc = -ENOMEM; goto out_free_unlock; } @@ -1333,7 +1316,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, if ((*new_auth_tok)->session_key.encrypted_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { printk(KERN_WARNING "Tag 1 packet contains key larger " - "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES"); + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); rc = -EINVAL; goto out; } @@ -2525,11 +2508,9 @@ int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) struct ecryptfs_key_sig *new_key_sig; new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); - if (!new_key_sig) { - printk(KERN_ERR - "Error allocating from ecryptfs_key_sig_cache\n"); + if (!new_key_sig) return -ENOMEM; - } + memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; /* Caller must hold keysig_list_mutex */ @@ -2545,16 +2526,12 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig, u32 global_auth_tok_flags) { struct ecryptfs_global_auth_tok *new_auth_tok; - int rc = 0; new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache, GFP_KERNEL); - if (!new_auth_tok) { - rc = -ENOMEM; - printk(KERN_ERR "Error allocating from " - "ecryptfs_global_auth_tok_cache\n"); - goto out; - } + if (!new_auth_tok) + return -ENOMEM; + memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); new_auth_tok->flags = global_auth_tok_flags; new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; @@ -2562,7 +2539,6 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, list_add(&new_auth_tok->mount_crypt_stat_list, &mount_crypt_stat->global_auth_tok_list); mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); -out: - return rc; + return 0; } diff --git a/fs/ecryptfs/main.c 
b/fs/ecryptfs/main.c index 6b801186baa5..025d66a705db 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -426,7 +426,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, mount_crypt_stat->global_default_cipher_key_size); if (!cipher_code) { ecryptfs_printk(KERN_ERR, - "eCryptfs doesn't support cipher: %s", + "eCryptfs doesn't support cipher: %s\n", mount_crypt_stat->global_default_cipher_name); rc = -EINVAL; goto out; @@ -560,8 +560,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. */ - s->s_flags = flags & ~MS_POSIXACL; - s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + s->s_flags = flags & ~SB_POSIXACL; + s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** * Force a read-only eCryptfs mount when: @@ -569,7 +569,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; @@ -602,7 +602,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); out_free: @@ -660,7 +660,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - unsigned long flags; + slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -781,7 +781,7 @@ static struct attribute *attributes[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = attributes, }; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 286f10b0363b..9fdd5bcf4564 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -147,8 +147,6 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file) (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); if (!(*daemon)) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " - "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); goto out; } (*daemon)->file = file; @@ -250,8 +248,6 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon, msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " - "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; @@ -386,7 +382,6 @@ int __init ecryptfs_init_messaging(void) GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } @@ -398,7 +393,6 @@ int __init ecryptfs_init_messaging(void) GFP_KERNEL); if (!ecryptfs_msg_ctx_arr) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); goto out; } mutex_init(&ecryptfs_msg_ctx_lists_mux); @@ -442,15 +436,16 @@ void ecryptfs_release_messaging(void) } if (ecryptfs_daemon_hash) { struct ecryptfs_daemon *daemon; + struct hlist_node *n; int i; mutex_lock(&ecryptfs_daemon_hash_mux); for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; - hlist_for_each_entry(daemon, - 
&ecryptfs_daemon_hash[i], - euid_chain) { + hlist_for_each_entry_safe(daemon, n, + &ecryptfs_daemon_hash[i], + euid_chain) { rc = ecryptfs_exorcise_daemon(daemon); if (rc) printk(KERN_ERR "%s: Error whilst " diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index e4141f257495..f09cacaf8c80 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -163,12 +163,8 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, struct ecryptfs_message *msg; msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL); - if (!msg) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%zd, GFP_KERNEL)\n", __func__, - (sizeof(*msg) + data_size)); + if (!msg) return -ENOMEM; - } mutex_lock(&msg_ctx->mux); msg_ctx->msg = msg; @@ -383,7 +379,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, goto memdup; } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) { printk(KERN_WARNING "%s: Acceptable packet size range is " - "[%d-%zu], but amount of data written is [%zu].", + "[%d-%zu], but amount of data written is [%zu].\n", __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count); return -EINVAL; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 1f0c471b4ba3..cdf358b209d9 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -431,8 +431,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) } xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL); if (!xattr_virt) { - printk(KERN_ERR "Out of memory whilst attempting to write " - "inode size to xattr\n"); rc = -ENOMEM; goto out; } diff --git a/fs/efs/super.c b/fs/efs/super.c index 65b59009555b..6ffb7ba1547a 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -116,7 +116,7 @@ static void destroy_inodecache(void) static int efs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -311,7 +311,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } s->s_op = &efs_superblock_operations; s->s_export_op = &efs_export_ops; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..afd548ebc328 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,12 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used for safe wake up implementation */ -static struct nested_calls poll_safewake_ncalls; - -/* Used to call file's f_op->poll() under the nested calls boundaries */ -static struct nested_calls poll_readywalk_ncalls; - /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -551,40 +545,21 @@ out_unlock: * this special case of epoll. 
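For reference, the nesting that this (now lockdep-only) safewake machinery exists for comes from registering one epoll fd inside another, which is perfectly legal:

#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int inner = epoll_create1(0);
	int outer = epoll_create1(0);
	int pfd[2];
	struct epoll_event ev = { .events = EPOLLIN };

	if (inner < 0 || outer < 0 || pipe(pfd) < 0)
		return 1;

	ev.data.fd = pfd[0];
	epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);

	/* Watching one epoll fd from another: readiness must propagate
	 * inner -> outer, re-entering wait-queue wakeup code.  That
	 * re-entry is what the nesting bookkeeping guards against. */
	ev.data.fd = inner;
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

	if (write(pfd[1], "x", 1) != 1)
		return 1;

	/* One ready event: the inner epoll fd itself. */
	return epoll_wait(outer, &ev, 1, 0) == 1 ? 0 : 1;
}

With the readywalk nested-calls infrastructure gone, the same readiness propagation is handled by ep_scan_ready_list() with an explicit depth argument, as the next hunk shows.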
*/ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) + +static struct nested_calls poll_safewake_ncalls; + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { unsigned long flags; + wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); - wake_up_locked_poll(wqueue, events); + spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); + wake_up_locked_poll(wqueue, POLLIN); spin_unlock_irqrestore(&wqueue->lock, flags); -} -#else -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - wake_up_poll(wqueue, events); -} -#endif -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, - 1 + call_nests); return 0; } -/* - * Perform a safe wake up of the poll wait list. The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_NESTS deep. - */ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); @@ -595,6 +570,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +#else + +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + wake_up_poll(wq, POLLIN); +} + +#endif + static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; @@ -880,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv); +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. 
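The depth bookkeeping exists for lockdep's benefit: a chain of nested epoll instances takes several ep->mtx locks of the same lock class, so each level must carry a distinct subclass. A fragment showing the underlying annotation (ep_like is a stand-in struct):

#include <linux/mutex.h>

struct ep_like {
	struct mutex mtx;
};

/* Two locks of one class in a fixed hierarchy: without the distinct
 * subclass numbers (the "depth"), lockdep would report a recursive
 * acquisition of the same lock class. */
static void lock_nested_pair(struct ep_like *outer, struct ep_like *inner)
{
	mutex_lock_nested(&outer->mtx, 0);
	mutex_lock_nested(&inner->mtx, 1);

	/* ... work on both instances ... */

	mutex_unlock(&inner->mtx);
	mutex_unlock(&outer->mtx);
}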
+ */ +static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) { + struct eventpoll *ep; + bool locked; + pt->_key = epi->event.events; + if (!is_file_epoll(epi->ffd.file)) + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & + epi->event.events; + + ep = epi->ffd.file->private_data; + poll_wait(epi->ffd.file, &ep->poll_wait, pt); + locked = pt && (pt->_qproc == ep_ptable_queue_proc); - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; + return ep_scan_ready_list(epi->ffd.file->private_data, + ep_read_events_proc, &depth, depth, + locked) & epi->event.events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -892,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, { struct epitem *epi, *tmp; poll_table pt; + int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); + depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt)) + if (ep_item_poll(epi, &pt, depth)) { return POLLIN | POLLRDNORM; - else { + } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as @@ -912,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - -struct readyevents_arg { - struct eventpoll *ep; - bool locked; -}; - -static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) -{ - struct readyevents_arg *arg = priv; - - return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, - call_nests + 1, arg->locked); -} - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - int pollflags; struct eventpoll *ep = file->private_data; - struct readyevents_arg arg; - - /* - * During ep_insert() we already hold the ep->mtx for the tfile. - * Prevent re-aquisition. - */ - arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); - arg.ep = ep; + int depth = 0; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside - * the ready list. This need to be done under ep_call_nested() - * supervision, since the call to f_op->poll() done on listed files - * could re-enter here. + * the ready list. */ - pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, &arg, ep, current); - - return pollflags != -1 ? pollflags : 0; + return ep_scan_ready_list(ep, ep_read_events_proc, + &depth, depth, false); } #ifdef CONFIG_PROC_FS @@ -1472,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = ep_item_poll(epi, &epq.pt); + revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue @@ -1606,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. 
*/ - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the item is "hot" and it is not registered inside the ready @@ -1674,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the event mask intersect the caller-requested one, @@ -2259,7 +2239,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, compat_size_t, sigsetsize) { long err; - compat_sigset_t csigmask; sigset_t ksigmask, sigsaved; /* @@ -2269,9 +2248,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); sigsaved = current->blocked; set_current_blocked(&ksigmask); } @@ -2315,11 +2293,10 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); - - /* Initialize the structure used to perform file's f_op->poll() calls */ - ep_nested_calls_init(&poll_readywalk_ncalls); +#endif /* * We can have many thousands of epitems, so prevent this from @@ -2329,11 +2306,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } diff --git a/fs/exec.c b/fs/exec.c index 1d6243d9f2b6..5688b5e1b937 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,15 +1216,14 @@ killed: return -EAGAIN; } -char *get_task_comm(char *buf, struct task_struct *tsk) +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); - strncpy(buf, tsk->comm, sizeof(tsk->comm)); + strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } -EXPORT_SYMBOL_GPL(get_task_comm); +EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e1b3724bebf2..33db13365c5e 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -548,7 +548,7 @@ do_more: } mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); group_adjust_blocks(sb, block_group, desc, bh2, group_freed); @@ -1424,7 +1424,7 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); *errp = 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c67b486488fd..2da67699dc33 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags 
& FAULT_FLAG_WRITE) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index a1fc3dabca41..6484199b35d1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -145,7 +145,7 @@ void ext2_free_inode (struct inode * inode) else ext2_release_inode(sb, block_group, is_directory); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); @@ -517,7 +517,7 @@ repeat_in_this_group: goto fail; got: mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e2b6be03e69b..7646818ab266 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -75,7 +75,7 @@ void ext2_error(struct super_block *sb, const char *function, if (test_opt(sb, ERRORS_RO)) { ext2_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -656,7 +656,7 @@ static int ext2_setup_super (struct super_block * sb, ext2_msg(sb, KERN_ERR, "error: revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) return res; @@ -924,9 +924,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = opts.s_resuid; sbi->s_resgid = opts.s_resgid; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && @@ -1178,7 +1178,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_WARNING, "warning: mounting ext3 filesystem as ext2"); if (ext2_setup_super (sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ext2_write_super(sb); return 0; @@ -1341,9 +1341,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) "dax flag with busy inodes while remounting"); new_opts.s_mount_opt ^= EXT2_MOUNT_DAX; } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out_set; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || !(sbi->s_mount_state & EXT2_VALID_FS)) goto out_set; @@ -1379,7 +1379,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) */ sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext2_setup_super (sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; spin_unlock(&sbi->s_lock); ext2_write_super(sb); @@ -1392,8 +1392,8 @@ out_set: sbi->s_mount_opt = new_opts.s_mount_opt; sbi->s_resuid = new_opts.s_resuid; sbi->s_resgid = new_opts.s_resgid; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 
SB_POSIXACL : 0); spin_unlock(&sbi->s_lock); return 0; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 07bca11749d4..c941251ac0c0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4722,6 +4722,7 @@ retry: EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); ret2 = ext4_journal_stop(handle); if (ret2) break; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ad204d2724ac..a0ae27b1bc66 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> +#include <linux/mman.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + pfn_t pfn; if (write) { sb_start_pagefault(sb); @@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - else - result = VM_FAULT_SIGBUS; + result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); + ext4_journal_stop(handle); + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { @@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b4267d72f249..b32cf263750d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -816,6 +816,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return ERR_CAST(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2633150e41b9..534a9130f625 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -149,6 +149,15 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_inode_is_fast_symlink(struct inode *inode) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + int ea_blocks = EXT4_I(inode)->i_file_acl ? 
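/*
 * On the ext4_inode_is_fast_symlink() change above: a fast symlink
 * stores its target inside the inode's i_block array, so once any
 * xattr block is discounted it occupies no data blocks at all. The
 * new code therefore tests i_blocks (counted in 512-byte sectors,
 * hence EXT4_CLUSTER_SIZE >> 9 for the i_file_acl block) rather than
 * relying solely on the old i_size < EXT4_N_BLOCKS * 4 heuristic,
 * with inline-data inodes explicitly excluded.
 */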
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } @@ -1719,7 +1728,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, ext4_es_remove_extent(inode, start, last - start + 1); } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end) { nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) @@ -2345,7 +2354,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) lblk = start << bpp_bits; pblock = mpd->map.m_pblk; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &start, end); @@ -2616,12 +2625,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) else tag = PAGECACHE_TAG_DIRTY; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) goto out; @@ -2629,16 +2638,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct page *page = pvec.pages[i]; /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) - goto out; - - /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently @@ -2752,7 +2751,7 @@ static int ext4_writepages(struct address_space *mapping, * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want @@ -3394,6 +3393,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) + return !jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid); + /* Any metadata buffers to write? 
*/ + if (!list_empty(&inode->i_mapping->private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3507,6 +3519,8 @@ retry: } iomap->flags = 0; + if (ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; @@ -5178,7 +5192,7 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); - if (inode->i_sb->s_flags & MS_LAZYTIME) + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b7558f292420..1eec25014f62 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -592,6 +592,44 @@ static int ext4_ioc_getfsmap(struct super_block *sb, return 0; } +static long ext4_ioctl_group_add(struct file *file, + struct ext4_new_group_data *input) +{ + struct super_block *sb = file_inode(file)->i_sb; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(file); + if (err) + goto group_add_out; + + err = ext4_group_add(sb, input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(file); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input->group); +group_add_out: + ext4_resize_end(sb); + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -776,44 +814,12 @@ mext_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; - int err, err2=0; - - err = ext4_resize_begin(sb); - if (err) - return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) { - err = -EFAULT; - goto group_add_out; - } - - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_add_out; - } - - err = mnt_want_write_file(filp); - if (err) - goto group_add_out; + sizeof(input))) + return -EFAULT; - err = ext4_group_add(sb, &input); - if (EXT4_SB(sb)->s_journal) { - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } - if (err == 0) - err = err2; - mnt_drop_write_file(filp); - if (!err && ext4_has_group_desc_csum(sb) && - test_opt(sb, INIT_INODE_TABLE)) - err = ext4_register_li_request(sb, input.group); -group_add_out: - ext4_resize_end(sb); - return err; + return ext4_ioctl_group_add(filp, &input); } case EXT4_IOC_MIGRATE: @@ -1078,8 +1084,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case EXT4_IOC32_GROUP_ADD: { struct compat_ext4_new_group_input __user *uinput; - struct ext4_new_group_input input; - mm_segment_t old_fs; + struct ext4_new_group_data input; int err; uinput = compat_ptr(arg); @@ -1092,12 +1097,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, 
unsigned long arg) &uinput->reserved_blocks); if (err) return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, - (unsigned long) &input); - set_fs(old_fs); - return err; + return ext4_ioctl_group_add(file, &input); } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 798b3ac680db..e750d68fbcb5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1399,6 +1399,10 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (!nblocks) { + ret = NULL; + goto cleanup_and_exit; + } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0556cd036b69..7c46693a14d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -422,7 +422,7 @@ static void ext4_handle_error(struct super_block *sb) * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && @@ -635,7 +635,7 @@ void __ext4_abort(struct super_block *sb, const char *function, * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); @@ -1682,10 +1682,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sb->s_flags |= SB_I_VERSION; return 1; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; return 1; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; return 1; } @@ -2116,7 +2116,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) goto done; @@ -2429,7 +2429,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list.\n"); es->s_last_orphan = 0; @@ -2438,19 +2438,19 @@ static void ext4_orphan_cleanup(struct super_block *sb, return; } - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. 
*/ - if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) @@ -2539,7 +2539,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } } #endif - sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } /* @@ -2741,7 +2741,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; return 1; } @@ -3623,8 +3623,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_iflags |= SB_I_CGROUPWB; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || @@ -4199,7 +4199,7 @@ no_journal: } if (ext4_setup_super(sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4693,7 +4693,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & MS_RDONLY)) + if (!(sb->s_flags & SB_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); if (sb->s_bdev->bd_part) es->s_kbytes_written = @@ -5047,8 +5047,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, "Abort forced by user"); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
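/*
 * The orphan-cleanup hunks above follow a common pattern: on a
 * read-only mount the SB_RDONLY bit is dropped temporarily (and
 * SB_ACTIVE set so iput() actually frees inodes), quota is turned on
 * so usage counts stay coherent while orphans are truncated, and the
 * saved s_flags are restored at the end. f2fs gains the same dance
 * in its recover_orphan_inodes() further down.
 */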
SB_POSIXACL : 0); es = sbi->s_es; @@ -5057,16 +5057,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if (*flags & MS_LAZYTIME) - sb->s_flags |= MS_LAZYTIME; + if (*flags & SB_LAZYTIME) + sb->s_flags |= SB_LAZYTIME; - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; @@ -5078,7 +5078,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition @@ -5140,7 +5140,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5164,7 +5164,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA @@ -5182,7 +5182,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif - *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. 
Opts: %s", orig_data); kfree(orig_data); return 0; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 436b3a1464d9..2bb7c9fc5144 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -250,6 +250,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + return __f2fs_set_acl(inode, type, acl, NULL); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 04fe1df052b2..4aa69bc1c70a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { set_ckpt_flags(sbi, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; if (!end_io) f2fs_flush_merged_writes(sbi); } @@ -305,25 +304,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; + pgoff_t index = 0, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; + int nr_pages; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); blk_start_plug(&plug); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (unlikely(nr_pages == 0)) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -401,24 +397,23 @@ const struct address_space_operations f2fs_meta_aops = { #endif }; -static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e, *tmp; tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); -retry: + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { e = tmp; - if (radix_tree_insert(&im->ino_root, ino, e)) { - spin_unlock(&im->ino_lock); - radix_tree_preload_end(); - goto retry; - } + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + memset(e, 0, sizeof(struct ino_entry)); e->ino = ino; @@ -426,6 +421,10 @@ retry: if (type != ORPHAN_INO) im->ino_num++; } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + spin_unlock(&im->ino_lock); radix_tree_preload_end(); @@ -454,7 +453,7 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ - __add_ino_entry(sbi, ino, type); + __add_ino_entry(sbi, ino, 0, type); } void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -480,7 +479,7 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) struct ino_entry *e, *tmp; int i; - for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? 
ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -494,6 +493,27 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -530,7 +550,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); update_inode_page(inode); } @@ -554,7 +574,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return err; } - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(sbi, ino, 0, ORPHAN_INO); inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { @@ -590,20 +610,24 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; + /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -631,9 +655,10 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } @@ -986,7 +1011,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) update_inode_page(inode); iput(inode); } - }; + } return 0; } @@ -1146,6 +1171,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; + int err; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1239,6 +1265,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 36b535207c88..516fa0d3ff9c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -173,7 +173,7 @@ static struct bio *__bio_alloc(struct 
f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - bio = f2fs_bio_alloc(npages); + bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; @@ -418,8 +418,8 @@ next: bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - /* set submitted = 1 as a return value */ - fio->submitted = 1; + /* set submitted = true as a return value */ + fio->submitted = true; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -473,7 +473,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, f2fs_wait_on_block_writeback(sbi, blkaddr); } - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) { if (ctx) fscrypt_release_ctx(ctx); @@ -833,6 +833,13 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + /* convert inline data for Direct I/O*/ + if (iocb->ki_flags & IOCB_DIRECT) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; @@ -845,15 +852,11 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; - if (iocb->ki_flags & IOCB_DIRECT) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; + if (iocb->ki_flags & IOCB_DIRECT) return f2fs_map_blocks(inode, &map, 1, __force_buffered_io(inode, WRITE) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); - } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) @@ -1334,7 +1337,7 @@ static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); @@ -1495,6 +1498,7 @@ static int __write_data_page(struct page *page, bool *submitted, int err = 0; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), @@ -1566,8 +1570,11 @@ write: err = do_write_data_page(&fio); } } + + down_write(&F2FS_I(inode)->i_sem); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); done: if (err && err != -ENOENT) @@ -1635,7 +1642,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) @@ -1669,8 +1676,8 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; @@ -1678,11 +1685,6 @@ retry: struct page *page = pvec.pages[i]; bool submitted = false; - if (page->index > end) { - done = 1; - break; - } - done_index = page->index; retry_write: lock_page(page); @@ -1937,6 +1939,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); + if (f2fs_is_atomic_file(inode) && + !available_free_memory(sbi, INMEM_PAGES)) { + err = -ENOMEM; + goto fail; + } + /* * We should check this at this 
moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1952,7 +1960,7 @@ repeat: * Do not use grab_cache_page_write_begin() to avoid deadlock due to * wait_for_stable_page. Will wait that below with our IO control. */ - page = pagecache_get_page(mapping, index, + page = f2fs_pagecache_get_page(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; @@ -2014,6 +2022,8 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages_all(sbi); return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 87f449845f5f..ecada8425268 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,9 +45,18 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + + si->nquota_files = 0; + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + si->nquota_files++; + } + } si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); @@ -61,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); } if (SM_I(sbi) && SM_I(sbi)->dcc_info) { si->nr_discarded = @@ -96,9 +107,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; - si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -231,14 +242,14 @@ get_cache: } /* free nids */ - si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + - NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= ORPHAN_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree); @@ -262,9 +273,10 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW"); + f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_cp_error(si->sbi) ? 
"Error": "Good"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -349,10 +361,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, + si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " @@ -365,6 +378,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c0c933ad43c8..2d98d877c09d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -10,10 +10,12 @@ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/sched/signal.h> #include "f2fs.h" #include "node.h" #include "acl.h" #include "xattr.h" +#include <trace/events/f2fs.h> static unsigned long dir_blocks(struct inode *inode) { @@ -847,6 +849,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct f2fs_dentry_block *dentry_blk = NULL; struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); @@ -855,24 +858,32 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (f2fs_encrypted_inode(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) - return err; + goto out; err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) - return err; + goto out; } if (f2fs_has_inline_dentry(inode)) { err = f2fs_read_inline_dir(file, ctx, &fstr); - goto out; + goto out_free; } - /* readahead for multi pages of dir */ - if (npages - n > 1 && !ra_has_index(ra, n)) - page_cache_sync_readahead(inode->i_mapping, ra, file, n, + for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); @@ -880,7 +891,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = 0; continue; } else { - goto out; + goto out_free; } } @@ -896,12 +907,13 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) break; } - ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } -out: +out_free: fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); return err < 0 ? 
err : 0; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 115204fdefcc..6abf26c31d01 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -44,6 +44,8 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, + FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -91,6 +93,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -116,6 +119,8 @@ struct f2fs_mount_info { #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -145,7 +150,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) -#define DISCARD_ISSUE_RATE 8 +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -156,7 +161,6 @@ struct cp_control { __u64 trim_start; __u64 trim_end; __u64 trim_minlen; - __u64 trimmed; }; /* @@ -175,12 +179,14 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ + FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; struct ino_entry { - struct list_head list; /* list head */ - nid_t ino; /* inode number */ + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ }; /* for the list of inodes to be GCed */ @@ -204,10 +210,6 @@ struct discard_entry { #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? 
\ (MAX_PLIST_NUM - 1) : (blk_num - 1)) -#define P_ACTIVE 0x01 -#define P_TRIM 0x02 -#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) - enum { D_PREP, D_SUBMIT, @@ -239,12 +241,32 @@ struct discard_cmd { int error; /* bio error */ }; +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + +struct discard_policy { + int type; /* type of discard */ + unsigned int min_interval; /* used for candidates exist */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int max_requests; /* # of discards issued per round */ + unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ + unsigned int granularity; /* discard granularity */ +}; + struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; @@ -377,11 +399,14 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ - (CUR_ADDRS_PER_INODE(inode) - \ - DEF_INLINE_RESERVED_SIZE - \ - F2FS_INLINE_XATTR_ADDRS)) +static inline int get_inline_xattr_addrs(struct inode *inode); +#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + F2FS_INLINE_XATTR_ADDRS(inode) - \ + DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ @@ -581,6 +606,7 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + struct list_head inmem_ilist; /* list for inmem inodes */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -591,6 +617,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -664,10 +691,13 @@ static inline void __try_update_largest_extent(struct inode *inode, } } -enum nid_list { - FREE_NID_LIST, - ALLOC_NID_LIST, - MAX_NID_LIST, +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, }; struct f2fs_nm_info { @@ -690,8 +720,8 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ - unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned 
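/*
 * Free-nid bookkeeping above is renamed from list-centric
 * (FREE_NID_LIST/ALLOC_NID_LIST) to state-centric
 * (FREE_NID/PREALLOC_NID): only genuinely free nids sit on the
 * single free_nid_list, while preallocated ones are tracked purely
 * by state in nid_cnt[]. The debug.c counters shown earlier are
 * updated to match the new enum names.
 */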
int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; @@ -769,6 +799,7 @@ enum { struct flush_cmd { struct completion wait; struct llist_node llnode; + nid_t ino; int ret; }; @@ -787,6 +818,8 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ + struct rw_semaphore curseg_lock; /* for preventing curseg change */ + block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -808,6 +841,7 @@ struct f2fs_sm_info { unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -829,6 +863,7 @@ struct f2fs_sm_info { enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -877,6 +912,18 @@ enum need_lock_type { LOCK_RETRY, }; +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, +}; + enum iostat_type { APP_DIRECT_IO, /* app direct IOs */ APP_BUFFERED_IO, /* app buffered IOs */ @@ -896,6 +943,7 @@ enum iostat_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ @@ -940,6 +988,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1042,12 +1091,15 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ + int inline_xattr_size; /* inline xattr size */ + unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ u32 s_next_generation; /* for NFS support */ @@ -1113,6 +1165,8 @@ struct f2fs_sb_info { struct list_head s_list; int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -1176,8 +1230,7 @@ static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) { - struct timespec ts = {sbi->interval_time[type], 0}; - unsigned long interval = timespec_to_jiffies(&ts); + unsigned long interval = sbi->interval_time[type] * HZ; return time_after(jiffies, 
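/*
 * f2fs_time_over() above now computes the interval directly as
 * seconds * HZ instead of round-tripping through a struct timespec
 * and timespec_to_jiffies(); for whole-second intervals the two are
 * equivalent, and the timespec dependency goes away.
 */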
sbi->last_time[type] + interval); } @@ -1344,6 +1397,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); @@ -1522,7 +1582,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; @@ -1556,6 +1617,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); f2fs_i_blocks_write(inode, count, false, true); } @@ -1576,6 +1641,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode) atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1592,6 +1659,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -1699,10 +1768,17 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + goto enospc; + } +#endif + spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count + sbi->reserved_blocks > + if (unlikely(valid_block_count + sbi->current_reserved_blocks > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; @@ -1745,6 +1821,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, sbi->total_valid_node_count--; sbi->total_valid_block_count--; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); @@ -1791,6 +1870,19 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); } +static inline struct page *f2fs_pagecache_get_page( + struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { + f2fs_show_injection_info(FAULT_PAGE_GET); + return NULL; + } +#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); +} + static inline void f2fs_copy_page(struct page *src, struct page *dst) { char *src_kaddr = kmap(src); @@ -1840,15 +1932,25 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(int npages) +static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, + int npages, bool no_fail) { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; + if (no_fail) { + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(FAULT_ALLOC_BIO); + return NULL; + } +#endif + return bio_alloc(GFP_KERNEL, npages); } static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, @@ -2158,25 +2260,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; - return CUR_ADDRS_PER_INODE(inode); + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); } -static inline void *inline_xattr_addr(struct page *page) +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS]); + F2FS_INLINE_XATTR_ADDRS(inode)]); } static inline int inline_xattr_size(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return F2FS_INLINE_XATTR_ADDRS << 2; - else - return 0; + return get_inline_xattr_addrs(inode) * sizeof(__le32); } static inline int f2fs_has_inline_data(struct inode *inode) @@ -2257,9 +2354,10 @@ static inline void clear_file(struct inode *inode, int type) static 
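/*
 * f2fs_bio_alloc() above now takes the sbi and a no_fail flag: write
 * paths keep the old GFP_NOIO plus __GFP_NOFAIL retry semantics,
 * while read paths (no_fail == false) allocate with GFP_KERNEL and
 * become a FAULT_ALLOC_BIO fault-injection point. Read-side callers
 * must therefore tolerate a NULL return, as f2fs_grab_read_bio()
 * does earlier in this series:
 *
 *	bio = f2fs_bio_alloc(sbi, nr_pages, false);
 *	if (!bio)
 *		return ERR_PTR(-ENOMEM);
 */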
inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + bool ret; + if (dsync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; spin_lock(&sbi->inode_lock[DIRTY_META]); ret = list_empty(&F2FS_I(inode)->gdirty_list); @@ -2270,12 +2368,17 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) file_keep_isize(inode) || i_size_read(inode) & PAGE_MASK) return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); + + down_read(&F2FS_I(inode)->i_sem); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + up_read(&F2FS_I(inode)->i_sem); + + return ret; } static inline int f2fs_readonly(struct super_block *sb) { - return sb->s_flags & MS_RDONLY; + return sb->s_flags & SB_RDONLY; } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) @@ -2320,6 +2423,12 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -2448,7 +2557,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -2476,7 +2585,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode, struct page *page); +int truncate_xattr_node(struct inode *inode); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); @@ -2511,19 +2620,22 @@ void destroy_node_manager_caches(void); */ bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages_all(struct f2fs_sb_info *sbi); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); -int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void init_discard_policy(struct discard_policy *dpolicy, int discard_type, + unsigned int granularity); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); +bool 
f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); @@ -2578,6 +2690,10 @@ void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); int acquire_orphan_inode(struct f2fs_sb_info *sbi); void release_orphan_inode(struct f2fs_sb_info *sbi); @@ -2665,14 +2781,16 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; int inmem_pages; - unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; @@ -2981,6 +3099,16 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); +} + +static inline int f2fs_sb_has_quota_ino(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 517e112c8a9a..7874bbd7311d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -53,6 +53,11 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct dnode_of_data dn; int err; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto err; + } + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -114,6 +119,7 @@ out_sem: out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); +err: return block_page_mkwrite_return(err); } @@ -138,27 +144,29 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) return 1; } -static inline bool need_do_checkpoint(struct inode *inode) +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool need_cp = false; + enum cp_reason_type cp_reason = CP_NO_NEEDED; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) - need_cp = true; + cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) - need_cp = true; + 
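/*
 * need_do_checkpoint() here returns an enum cp_reason_type rather
 * than a bool: CP_NO_NEEDED is zero, so the existing
 * "if (cp_reason)" style test still reads naturally, while
 * trace_f2fs_sync_file_exit() can now report why an fsync had to
 * fall back to a full checkpoint (hardlink, wrong pino, no
 * roll-forward space, fastboot mode, and so on).
 */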
cp_reason = CP_WRONG_PINO; else if (!space_for_roll_forward(sbi)) - need_cp = true; + cp_reason = CP_NO_SPC_ROLL; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; + cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) - need_cp = true; + cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) - need_cp = true; + cp_reason = CP_SPEC_LOG_NUM; - return need_cp; + return cp_reason; } static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) @@ -193,7 +201,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; - bool need_cp = false; + enum cp_reason_type cp_reason = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, @@ -212,7 +220,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, clear_inode_flag(inode, FI_NEED_IPU); if (ret) { - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } @@ -243,10 +251,10 @@ go_write: * sudden-power-off. */ down_read(&F2FS_I(inode)->i_sem); - need_cp = need_do_checkpoint(inode); + cp_reason = need_do_checkpoint(inode); up_read(&F2FS_I(inode)->i_sem); - if (need_cp) { + if (cp_reason) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); @@ -294,37 +302,43 @@ sync_nodes: remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_ino_entry(sbi, ino, UPDATE_INO); - clear_inode_flag(inode, FI_UPDATE_WRITE); if (!atomic) - ret = f2fs_issue_flush(sbi); + ret = f2fs_issue_flush(sbi, inode->i_ino); + if (!ret) { + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, FLUSH_INO); + } f2fs_update_time(sbi, REQ_TIME); out: - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; return f2fs_do_sync_file(file, start, end, datasync, false); } static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { - struct pagevec pvec; + struct page *page; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ - pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, - PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? 
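/*
 * Pagevec API churn visible throughout this series: pagevec_init()
 * lost its "cold" argument, and the open-coded "lookup, then check
 * page->index > end" loops are replaced by
 * pagevec_lookup_range_tag(), which takes the end index itself.
 * __get_first_dirty_index() here goes one step further and calls
 * find_get_pages_tag() directly, since it only ever wants one page:
 *
 *	nr = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY,
 *				1, &page);
 */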
pvec.pages[0]->index : ULONG_MAX; - pagevec_release(&pvec); + nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!nr_pages) + return ULONG_MAX; + pgofs = page->index; + put_page(page); return pgofs; } @@ -443,6 +457,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -629,6 +646,9 @@ int f2fs_truncate(struct inode *inode) { int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return 0; @@ -683,6 +703,12 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_NODUMP); generic_fillattr(inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + return 0; } @@ -722,6 +748,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) int err; bool size_changed = false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + err = setattr_prepare(dentry, attr); if (err) return err; @@ -774,6 +803,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = inode->i_ctime = current_time(inode); } + down_write(&F2FS_I(inode)->i_sem); + F2FS_I(inode)->last_disk_size = i_size_read(inode); + up_write(&F2FS_I(inode)->i_sem); + size_changed = true; } @@ -844,7 +877,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start++; + pg_start = get_next_page_offset(&dn, pg_start); continue; } return err; @@ -1159,11 +1192,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - goto out; + goto out_unlock; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1175,7 +1211,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); - +out_unlock: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1358,6 +1395,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1385,6 +1425,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); + + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1434,8 +1476,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + 
file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } return err; } @@ -1446,6 +1492,9 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -1479,8 +1528,6 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); - if (mode & FALLOC_FL_KEEP_SIZE) - file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1882,6 +1929,9 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); + if (!f2fs_sb_has_crypto(inode->i_sb)) + return -EOPNOTSUPP; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); @@ -1889,6 +1939,8 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { + if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -2244,9 +2296,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); + down_write(&F2FS_I(src)->dio_rwsem[WRITE]); if (src != dst) { - if (!inode_trylock(dst)) { - ret = -EBUSY; + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + inode_unlock(dst); goto out; } } @@ -2306,9 +2362,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } f2fs_unlock_op(sbi); out_unlock: - if (src != dst) + if (src != dst) { + up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); inode_unlock(dst); + } out: + up_write(&F2FS_I(src)->dio_rwsem[WRITE]); inode_unlock(src); return ret; } @@ -2624,6 +2683,9 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + switch (cmd) { case F2FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); @@ -2681,6 +2743,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { @@ -2691,6 +2756,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) err = f2fs_preallocate_blocks(iocb, from); if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); inode_unlock(inode); return err; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bfe6a8ccc3a0..d844dcb80570 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -267,16 +267,6 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - unsigned int valid_blocks = - get_valid_blocks(sbi, segno, true); - - return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
- valid_blocks * 2 : valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -285,7 +275,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_greedy_cost(sbi, segno); + return get_valid_blocks(sbi, segno, true); else return get_cb_cost(sbi, segno); } @@ -466,10 +456,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, struct seg_entry *sentry; int ret; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return ret; } @@ -608,6 +598,7 @@ static void move_data_block(struct inode *inode, block_t bidx, { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_READ, @@ -659,8 +650,8 @@ static void move_data_block(struct inode *inode, block_t bidx, allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, - FGP_LOCK | FGP_CREAT, GFP_NOFS); + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; goto recover_block; @@ -738,6 +729,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_WRITE, @@ -840,10 +832,17 @@ next_step: continue; } + if (!down_write_trylock( + &F2FS_I(inode)->dio_rwsem[WRITE])) { + iput(inode); + continue; + } + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -901,10 +900,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, struct sit_info *sit_i = SIT_I(sbi); int ret; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); return ret; } @@ -952,8 +951,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* * this is to avoid deadlock: * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) @@ -1006,7 +1005,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); gc_more: - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 8322e4e7bb3f..90e38d8ea688 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -112,6 +112,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 53fb08810ee9..b4c4f2b25304 100644 --- a/fs/f2fs/inode.c +++ 
b/fs/f2fs/inode.c @@ -235,6 +235,23 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. + */ + fi->i_inline_xattr_size = 0; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -387,6 +404,10 @@ int update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { @@ -483,6 +504,7 @@ void f2fs_evict_inode(struct inode *inode) remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); @@ -522,8 +544,10 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + else + f2fs_inode_synced(inode); /* ino == 0, if f2fs_new_inode() was failed */ if (inode->i_ino) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a4dab98c4b7b..28bdf8828e73 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + int xattr_size = 0; int err; inode = new_inode(dir->i_sb); @@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = sbi->inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + f2fs_init_extent_tree(inode, NULL); stat_inc_inline_xattr(inode); @@ -177,6 +190,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -221,6 +237,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir) &&
!fscrypt_has_permitted_context(dir, inode)) return -EPERM; @@ -331,12 +350,15 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct f2fs_dir_entry *de; struct page *page; - nid_t ino; + struct dentry *new; + nid_t ino = -1; int err = 0; unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + trace_f2fs_lookup_start(dir, dentry, flags); + if (f2fs_encrypted_inode(dir)) { - int res = fscrypt_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); /* * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is @@ -346,18 +368,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (fscrypt_has_encryption_key(dir)) fscrypt_set_encrypted_dentry(dentry); fscrypt_set_d_op(dentry); - if (res && res != -ENOKEY) - return ERR_PTR(res); + if (err && err != -ENOKEY) + goto out; } - if (dentry->d_name.len > F2FS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { - if (IS_ERR(page)) - return (struct dentry *)page; - return d_splice_alias(inode, dentry); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + goto out_splice; } ino = le32_to_cpu(de->ino); @@ -365,19 +391,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { err = __recover_dot_dentries(dir, root_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && @@ -386,12 +414,18 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); err = -EPERM; - goto err_out; + goto out_iput; } - return d_splice_alias(inode, dentry); - -err_out: +out_splice: + new = d_splice_alias(inode, dentry); + if (IS_ERR(new)) + err = PTR_ERR(new); + trace_f2fs_lookup_end(dir, dentry, ino, err); + return new; +out_iput: iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); return ERR_PTR(err); } @@ -405,9 +439,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; + err = dquot_initialize(inode); + if (err) + return err; de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { @@ -460,6 +500,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct fscrypt_symlink_data *sd = NULL; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { err = fscrypt_get_encryption_info(dir); if (err) @@ -566,6 +609,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -618,6 +664,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -712,6 +761,9 @@ out: static 
int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { int err = fscrypt_get_encryption_info(dir); if (err) @@ -723,6 +775,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); } @@ -742,6 +797,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && @@ -767,6 +825,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; + if (new_inode) { + err = dquot_initialize(new_inode); + if (err) + goto out; + } + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -935,6 +999,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fca87835a1da..d3322752426f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,7 +46,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + mem_size = (nm_i->nid_cnt[FREE_NID] * sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { @@ -63,7 +63,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) } else if (type == INO_ENTRIES) { int i; - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) mem_size += sbi->im[i].ino_num * sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; @@ -74,6 +74,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INMEM_PAGES) { + /* it allows 20% / total_ram for inmemory pages */ + mem_size = get_pages(sbi, F2FS_INMEM_PAGES); + res = mem_size < (val.totalram / 5); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; @@ -134,6 +138,44 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } +static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +{ + struct nat_entry *new; + + if (no_fail) + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + else + new = kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} + +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); +} + +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) +{ + if (no_fail) + 
f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; + + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); + list_add_tail(&ne->list, &nm_i->nat_entries); + nm_i->nat_cnt++; + return ne; +} + static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -150,7 +192,7 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) list_del(&e->list); radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; - kmem_cache_free(nat_entry_slab, e); + __free_nat_entry(e); } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -246,49 +288,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, - bool no_fail) -{ - struct nat_entry *new; - - if (no_fail) { - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); - } else { - new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - if (!new) - return NULL; - if (radix_tree_insert(&nm_i->nat_root, nid, new)) { - kmem_cache_free(nat_entry_slab, new); - return NULL; - } - } - - memset(new, 0, sizeof(struct nat_entry)); - nat_set_nid(new, nid); - nat_reset_flag(new); - list_add_tail(&new->list, &nm_i->nat_entries); - nm_i->nat_cnt++; - return new; -} - +/* must be locked by nat_tree_lock */ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct nat_entry *e; + struct nat_entry *new, *e; + new = __alloc_nat_entry(nid, false); + if (!new) + return; + + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (!e) { - e = grab_nat_entry(nm_i, nid, false); - if (e) - node_info_from_raw_nat(&e->ni, ne); - } else { + if (!e) + e = __init_nat_entry(nm_i, new, ne, false); + else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - } + up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -296,11 +318,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; + struct nat_entry *new = __alloc_nat_entry(ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid, true); + e = __init_nat_entry(nm_i, new, NULL, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -312,6 +335,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); /* sanity check */ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); @@ -327,10 +353,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); - - /* in order to reuse the nid */ - if (nm_i->next_scan_nid > ni->nid) - nm_i->next_scan_nid = ni->nid; } /* change address */ @@ -424,9 +446,7 @@ 
void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) f2fs_put_page(page, 1); cache: /* cache nat entry */ - down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - up_write(&nm_i->nat_tree_lock); } /* @@ -962,7 +982,8 @@ fail: return err > 0 ? 0 : err; } -int truncate_xattr_node(struct inode *inode, struct page *page) +/* caller must lock inode page */ +int truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -978,10 +999,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, page, npage, nid); - - if (page) - dn.inode_page_locked = true; + set_new_dnode(&dn, inode, NULL, npage, nid); truncate_node(&dn); return 0; } @@ -1000,7 +1018,7 @@ int remove_inode_page(struct inode *inode) if (err) return err; - err = truncate_xattr_node(inode, dn.inode_page); + err = truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; @@ -1220,7 +1238,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + page = f2fs_pagecache_get_page(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; @@ -1244,54 +1263,19 @@ iput_out: iput(inode); } -void move_node_page(struct page *node_page, int gc_type) -{ - if (gc_type == FG_GC) { - struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - .for_reclaim = 0, - }; - - set_page_dirty(node_page); - f2fs_wait_on_page_writeback(node_page, NODE, true); - - f2fs_bug_on(sbi, PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) - goto out_page; - - if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) - unlock_page(node_page); - goto release_page; - } else { - /* set page dirty and write it */ - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } -out_page: - unlock_page(node_page); -release_page: - f2fs_put_page(node_page, 0); -} - static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; struct page *last_page = NULL; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1344,6 +1328,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, + .ino = ino_of_node(page), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), @@ -1416,6 +1401,37 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (__write_node_page(node_page, false, 
NULL, + &wbc, false, FS_GC_NODE_IO)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { @@ -1425,13 +1441,14 @@ static int f2fs_write_node_page(struct page *page, int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { - pgoff_t index, end; + pgoff_t index; pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nr_pages; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1439,17 +1456,12 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1548,25 +1560,21 @@ out: int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; int step = 0; int nwritten = 0; int ret = 0; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next_step: index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1655,27 +1663,20 @@ out: int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = ULONG_MAX; + pgoff_t index = 0; struct pagevec pvec; int ret2, ret = 0; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (unlikely(page->index > end)) - continue; - if (ino && ino_of_node(page) == ino) { f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) @@ -1761,35 +1762,54 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static int __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool new) +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (new) { - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); - if (err) - return 
err; - } + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]++; - list_add_tail(&i->list, &nm_i->nid_list[list]); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]++; + if (state == FREE_NID) + list_add_tail(&i->list, &nm_i->free_nid_list); return 0; } -static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool reuse) +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]--; - list_del(&i->list); - if (!reuse) - radix_tree_delete(&nm_i->free_nid_root, i->nid); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); + } } /* return if the nid is recognized as free */ @@ -1807,7 +1827,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; - i->state = NID_NEW; + i->state = FREE_NID; if (radix_tree_preload(GFP_NOFS)) goto err; @@ -1820,7 +1840,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - f2fs_create * - f2fs_new_inode * - alloc_nid - * - __insert_nid_to_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg * - build_free_nids * - __build_free_nids @@ -1833,8 +1853,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - new_node_page * - set_node_addr * - alloc_nid_done - * - __remove_nid_from_list(ALLOC_NID_LIST) - * - __insert_nid_to_list(FREE_NID_LIST) + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || @@ -1843,13 +1863,13 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) e = __lookup_free_nid_list(nm_i, nid); if (e) { - if (e->state == NID_NEW) + if (e->state == FREE_NID) ret = true; goto err_out; } } ret = true; - err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + err = __insert_free_nid(sbi, i, FREE_NID); err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); @@ -1867,8 +1887,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1887,15 +1907,18 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) return; - if (set) + if (set) { + if (test_bit_le(nid_ofs, 
nm_i->free_nid_bitmap[nat_ofs])) + return; __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - else - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - - if (set) nm_i->free_nid_count[nat_ofs]++; - else if (!build) - nm_i->free_nid_count[nat_ofs]--; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1930,12 +1953,32 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +static void scan_curseg_cache(struct f2fs_sb_info *sbi) { - struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int i, idx; + nid_t nid; down_read(&nm_i->nat_tree_lock); @@ -1945,40 +1988,27 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) if (!nm_i->free_nid_count[i]) continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { - nid_t nid; - - if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) - continue; + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) + break; nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; } } out: - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; - nid_t nid; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); } static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; @@ -1986,7 +2016,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nid = 0; /* Enough entries */ - if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; if (!sync && !available_free_memory(sbi, FREE_NIDS)) @@ -1996,7 +2026,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); - if (nm_i->nid_cnt[FREE_NID_LIST]) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; } @@ -2024,18 +2054,8 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; + scan_curseg_cache(sbi); - addr = 
le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), @@ -2073,15 +2093,13 @@ retry: } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); - i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); - i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); nm_i->available_nids--; update_free_nid_bitmap(sbi, *nid, false, false); @@ -2107,7 +2125,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -2130,12 +2148,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, !i); if (!available_free_memory(sbi, FREE_NIDS)) { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); - i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); } nm_i->available_nids++; @@ -2154,20 +2170,19 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], - list) { + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { if (nr_shrink <= 0 || - nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + __remove_free_nid(sbi, i, FREE_NID); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2193,8 +2208,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page) goto update_inode; } - dst_addr = inline_xattr_addr(ipage); - src_addr = inline_xattr_addr(page); + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -2283,6 +2298,12 @@ retry: dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + if (f2fs_sb_has_project_quota(sbi->sb) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) @@ -2354,8 +2375,8 @@ static void remove_nats_in_journal(struct 
f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid, true); - node_info_from_raw_nat(&ne->ni, &raw_ne); + ne = __alloc_nat_entry(nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true); } /* @@ -2401,15 +2422,17 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; - int i; + int i = 0; if (!enabled_nat_bits(sbi, NULL)) return; - for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { - if (start_nid == 0 && i == 0) - valid++; - if (nat_blk->entries[i].block_addr) + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (nat_blk->entries[i].block_addr != NULL_ADDR) valid++; } if (valid == 0) { @@ -2604,7 +2627,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) __set_bit_le(i, nm_i->nat_block_bitmap); nid = i * NAT_ENTRY_PER_BLOCK; - last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) @@ -2639,16 +2662,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - F2FS_RESERVED_NODE_NUM; - nm_i->nid_cnt[FREE_NID_LIST] = 0; - nm_i->nid_cnt[ALLOC_NID_LIST] = 0; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); - INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); + INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -2740,16 +2762,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], - list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + __remove_free_nid(sbi, i, FREE_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); - f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); - f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index bb53e9955ff2..0ee3e5ff49a3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -140,6 +140,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ + INMEM_PAGES, /* indicates inmemory pages */ BASE_CHECK, /* check kernel status */ }; @@ -150,18 +151,10 @@ struct nat_entry_set { unsigned int entry_cnt; /* the # of nat entries in set */ }; -/* - * For free nid mangement - */ -enum nid_state { - NID_NEW, /* newly added to free nid list */ - NID_ALLOC /* it is allocated */ -}; - struct free_nid { struct list_head list; /* for free node id list */ nid_t 
nid; /* node id */ - int state; /* in use or not: NID_NEW or NID_ALLOC */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ }; static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) @@ -170,12 +163,11 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *fnid; spin_lock(&nm_i->nid_list_lock); - if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + if (nm_i->nid_cnt[FREE_NID] <= 0) { spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], - struct free_nid, list); + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9626758bc762..b3a14b0429f2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -594,17 +594,20 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -665,9 +668,10 @@ skip: out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return ret ? 
ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c695ff462ee6..c117e0913f2a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -181,11 +181,12 @@ bool need_SSR(struct f2fs_sb_info *sbi) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void register_inmem_page(struct inode *inode, struct page *page) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; @@ -204,6 +205,10 @@ void register_inmem_page(struct inode *inode, struct page *page) mutex_lock(&fi->inmem_lock); get_page(page); list_add_tail(&new->list, &fi->inmem_pages); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(&fi->inmem_ilist)) + list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); @@ -262,12 +267,41 @@ next: return err; } +void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; + struct inode *inode; + struct f2fs_inode_info *fi; +next: + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + return; + } + fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + if (inode) { + drop_inmem_pages(inode); + iput(inode); + } + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto next; +} + void drop_inmem_pages(struct inode *inode) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); @@ -313,6 +347,7 @@ static int __commit_inmem_pages(struct inode *inode, struct inmem_pages *cur, *tmp; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, @@ -398,6 +433,10 @@ int commit_inmem_pages(struct inode *inode) /* drop all uncommitted pages */ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_COMMIT); @@ -472,7 +511,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(0); + struct bio *bio = f2fs_bio_alloc(sbi, 0, true); int ret; bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; @@ -485,15 +524,17 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, return ret; } -static int submit_flush_wait(struct f2fs_sb_info *sbi) +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { - int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); + int ret = 0; int i; - if (!sbi->s_ndevs || ret) - return ret; + if (!sbi->s_ndevs) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); - for (i = 1; i < sbi->s_ndevs; i++) { + for (i = 0; i < sbi->s_ndevs; i++) { + if 
(!is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; @@ -519,7 +560,9 @@ repeat: fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - ret = submit_flush_wait(sbi); + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, @@ -537,7 +580,7 @@ repeat: goto repeat; } -int f2fs_issue_flush(struct f2fs_sb_info *sbi) +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; @@ -547,19 +590,20 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); atomic_inc(&fcc->issued_flush); return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1) { - ret = submit_flush_wait(sbi); + if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->issing_flush); atomic_inc(&fcc->issued_flush); return ret; } + cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); @@ -583,7 +627,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) } else { struct flush_cmd *tmp, *next; - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { @@ -653,6 +697,28 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } } +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + if (!sbi->s_ndevs) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -794,6 +860,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) @@ -845,10 +913,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc) + struct discard_policy *dpolicy, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); struct bio *bio = NULL; + int flag = dpolicy->sync ? 
REQ_SYNC : 0; if (dc->state != D_PREP) return; @@ -867,9 +939,9 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; - bio->bi_opf |= REQ_SYNC; + bio->bi_opf |= flag; submit_bio(bio); - list_move_tail(&dc->list, &dcc->wait_list); + list_move_tail(&dc->list, wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); f2fs_update_iostat(sbi, FS_DISCARD, 1); @@ -886,7 +958,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct rb_node **p = &dcc->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct discard_cmd *dc = NULL; @@ -1054,58 +1126,107 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + schedule(); + + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0, issued = 0; - int i; + int i, iter = 0, issued = 0; bool io_interrupted = false; - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); - blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dpolicy->granularity) + break; pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - /* Hurry up to finish fstrim */ - if (dcc->pend_list_tag[i] & P_TRIM) { - __submit_discard_cmd(sbi, dc); - issued++; - - if (fatal_signal_pending(current)) - break; - continue; - } - - if (!issue_cond) { - __submit_discard_cmd(sbi, dc); - issued++; - continue; - } - - if (is_idle(sbi)) { - __submit_discard_cmd(sbi, dc); - issued++; - } else { + if (dpolicy->io_aware && i 
< dpolicy->io_aware_gran && + !is_idle(sbi)) { io_interrupted = true; + goto skip; } - if (++iter >= DISCARD_ISSUE_RATE) - goto out; + __submit_discard_cmd(sbi, dpolicy, dc); + issued++; +skip: + if (++iter >= dpolicy->max_requests) + break; } - if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) - dcc->pend_list_tag[i] &= (~P_TRIM); + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (iter >= dpolicy->max_requests) + break; } -out: - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); if (!issued && io_interrupted) issued = -1; @@ -1113,12 +1234,13 @@ out: return issued; } -static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; + bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { @@ -1126,39 +1248,58 @@ static void __drop_discard_cmd(struct f2fs_sb_info *sbi) list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); + dropped = true; } } mutex_unlock(&dcc->cmd_lock); + + return dropped; } -static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; - if (!dc->ref) + if (!dc->ref) { + if (!dc->error) + len = dc->len; __remove_discard_cmd(sbi, dc); + } mutex_unlock(&dcc->cmd_lock); + + return len; } -static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->wait_list); + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? 
+ &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; + unsigned int trimmed = 0; next: need_wait = false; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + if (dc->lstart + dc->len <= start || end <= dc->lstart) + continue; + if (dc->len < dpolicy->granularity) + continue; + if (dc->state == D_DONE && !dc->ref) { wait_for_completion_io(&dc->wait); + if (!dc->error) + trimmed += dc->len; __remove_discard_cmd(sbi, dc); } else { dc->ref++; @@ -1169,9 +1310,17 @@ next: mutex_unlock(&dcc->cmd_lock); if (need_wait) { - __wait_one_discard_bio(sbi, dc); + trimmed += __wait_one_discard_bio(sbi, dc); goto next; } + + return trimmed; +} + +static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1209,23 +1358,19 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) -{ - __issue_discard_cmd(sbi, false); - __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, !umount); -} - -static void mark_discard_range_all(struct f2fs_sb_info *sbi) +/* This comes from f2fs_put_super */ +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; + struct discard_policy dpolicy; + bool dropped; - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) - dcc->pend_list_tag[i] |= P_TRIM; - mutex_unlock(&dcc->cmd_lock); + init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + dropped = __drop_discard_cmd(sbi); + __wait_all_discard_cmd(sbi, &dpolicy); + + return dropped; } static int issue_discard_thread(void *data) @@ -1233,12 +1378,16 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + struct discard_policy dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; set_freezable(); do { + init_discard_policy(&dpolicy, DPOLICY_BG, + dcc->discard_granularity); + wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || dcc->discard_wake, @@ -1251,17 +1400,18 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) { dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - mark_discard_range_all(sbi); + init_discard_policy(&dpolicy, + DPOLICY_FORCE, 1); } sb_start_intwrite(sbi->sb); - issued = __issue_discard_cmd(sbi, true); + issued = __issue_discard_cmd(sbi, &dpolicy); if (issued) { - __wait_discard_cmd(sbi, true); - wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; } else { - wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + wait_ms = dpolicy.max_interval; } sb_end_intwrite(sbi->sb); @@ -1525,7 +1675,6 @@ find_next: f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); - cpc->trimmed += len; total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, @@ -1546,6 +1695,37 @@ skip: wake_up_discard_thread(sbi, false); } +void init_discard_policy(struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + 
dpolicy->granularity = granularity; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } +} + static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -1563,12 +1743,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) { + for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); - if (i >= dcc->discard_granularity - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - } INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); @@ -1716,16 +1894,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) -{ - update_sit_entry(sbi, new, 1); - if (GET_SEGNO(sbi, old) != NULL_SEGNO) - update_sit_entry(sbi, old, -1); - - locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); - locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); -} - void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); @@ -1736,14 +1904,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; /* add it into sit main buffer */ - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) @@ -1756,7 +1924,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return true; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); @@ -1765,7 +1933,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return is_cp; } @@ -1823,12 +1991,8 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = grab_meta_page(sbi, blk_addr); - void *dst = page_address(page); - if (src) - memcpy(dst, src, PAGE_SIZE); - else - memset(dst, 0, PAGE_SIZE); + memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -1927,7 +2091,6 @@ 
find_other_zone: } secno = left_start; skip_left: - hint = secno; segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); @@ -2162,12 +2325,16 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); locate_dirty_segment(sbi, old_segno); } + + up_write(&SIT_I(sbi)->sentry_lock); } static const struct segment_allocation default_salloc_ops = { @@ -2179,14 +2346,14 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u64 trim_start = cpc->trim_start; bool has_candidate = false; - mutex_lock(&SIT_I(sbi)->sentry_lock); + down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } - mutex_unlock(&SIT_I(sbi)->sentry_lock); + up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; @@ -2196,14 +2363,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno; + unsigned int start_segno, end_segno, cur_segno; + block_t start_block, end_block; struct cp_control cpc; + struct discard_policy dpolicy; + unsigned long long trimmed = 0; int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - cpc.trimmed = 0; if (end <= MAIN_BLKADDR(sbi)) goto out; @@ -2217,12 +2386,14 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); /* do checkpoint to issue discard commands safely */ - for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { - cpc.trim_start = start_segno; + for (cur_segno = start_segno; cur_segno <= end_segno; + cur_segno = cpc.trim_end + 1) { + cpc.trim_start = cur_segno; if (sbi->discard_blks == 0) break; @@ -2230,7 +2401,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.trim_end = end_segno; else cpc.trim_end = min_t(unsigned int, - rounddown(start_segno + + rounddown(cur_segno + BATCHED_TRIM_SEGMENTS(sbi), sbi->segs_per_sec) - 1, end_segno); @@ -2242,11 +2413,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } - /* It's time to issue all the filed discards */ - mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi, false); + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + + init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); out: - range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } @@ -2258,6 +2434,18 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } +int rw_hint_to_seg_type(enum rw_hint hint) +{ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + return CURSEG_WARM_DATA; + } +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2292,7 +2480,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; - return CURSEG_WARM_DATA; + return rw_hint_to_seg_type(inode->i_write_hint); } else { if (IS_DNODE(fio->page)) return is_cold_node(fio->page) ? CURSEG_WARM_NODE : @@ -2336,8 +2524,10 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + down_read(&SM_I(sbi)->curseg_lock); + mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -2354,15 +2544,26 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. + */ + update_sit_entry(sbi, *new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); + /* - * SIT information should be updated after segment allocation, - * since we need to keep dirty segments precisely under SSR. + * segment dirty status should be updated after segment allocation, + * so we only need to update the status once, after the previous + * segment has been closed.
*/ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); @@ -2382,6 +2583,29 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); +} + +static void update_device_state(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int devidx; + + if (!sbi->s_ndevs) + return; + + devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + + /* update device state for fsync */ + set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) @@ -2398,6 +2622,8 @@ reallocate: if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; + } else if (!err) { + update_device_state(fio); } } @@ -2458,12 +2684,26 @@ int rewrite_data_page(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); + if (!err) + update_device_state(fio); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); return err; } +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; +} + void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) @@ -2479,6 +2719,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; + down_write(&SM_I(sbi)->curseg_lock); + if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { @@ -2488,14 +2730,19 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, type = CURSEG_WARM_DATA; } } else { - if (!IS_CURSEG(sbi, segno)) + if (IS_CURSEG(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { type = CURSEG_WARM_DATA; + } } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; @@ -2527,8 +2774,9 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg->next_blkoff = old_blkoff; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); + up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -2982,7 +3230,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool to_journal = true; struct seg_entry *se; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; @@ -3076,7 +3324,7 @@ out: cpc->trim_start = trim_start; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); 
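[Editor's note] The segment.c hunks above add rw_hint_to_seg_type() and make __get_segment_type_6() consult inode->i_write_hint, so a userspace write-lifetime hint now steers f2fs data temperature: WRITE_LIFE_SHORT is allocated from the hot data log, WRITE_LIFE_EXTREME from the cold one, and everything else stays warm. A minimal userspace sketch of exercising that path through fcntl(2); the file path and the fallback #defines are illustrative assumptions, not part of this diff:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Fallback uapi values for older libc headers (F_SET_RW_HINT is 4.13+). */
#ifndef F_SET_RW_HINT
#define F_LINUX_SPECIFIC_BASE	1024
#define F_SET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 12)
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(void)
{
	/* example path on an f2fs mount */
	int fd = open("/mnt/f2fs/tmp.dat", O_WRONLY | O_CREAT, 0644);
	uint64_t hint = RWH_WRITE_LIFE_SHORT;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Tag the inode's data as short-lived; with this patch f2fs routes
	 * its blocks to the CURSEG_HOT_DATA log via rw_hint_to_seg_type(). */
	if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
		perror("F_SET_RW_HINT");
	close(fd);
	return 0;
}

F_SET_RW_HINT stores the hint in the inode (inode->i_write_hint), which is exactly the field the reworked allocator consults.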
set_prefree_as_free_segments(sbi); } @@ -3169,7 +3417,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = ktime_get_real_seconds(); - mutex_init(&sit_i->sentry_lock); + init_rwsem(&sit_i->sentry_lock); return 0; } @@ -3410,7 +3658,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); sit_i->min_mtime = LLONG_MAX; @@ -3427,7 +3675,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } int build_segment_manager(struct f2fs_sb_info *sbi) @@ -3460,11 +3708,14 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); + init_rwsem(&sm_info->curseg_lock); + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e0a6cc23ace3..d1d394cdf61d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -231,7 +231,7 @@ struct sit_info { unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ - struct mutex sentry_lock; /* to protect SIT cache */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ @@ -497,6 +497,33 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +{ + unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; + int i; + + /* check current node segment */ + for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + segno = CURSEG_I(sbi, i)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + if (node_blocks > left_blocks) + return false; + } + + /* check current data segment */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (dent_blocks > left_blocks) + return false; + return true; +} + static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -507,6 +534,9 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; + if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && + has_curseg_enough_space(sbi)) + return false; return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + needed); @@ -731,7 +761,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if 
(get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > sbi->fggc_threshold) return true; return false; @@ -796,8 +826,9 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) goto wake_up; mutex_lock(&dcc->cmd_lock); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 5c60fc28ec75..0b5664a1a6cc 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,7 +28,7 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; return count > 0 ? count : 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 97e03c637e90..708155d9c2e4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -44,6 +44,8 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", @@ -92,6 +94,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_noinline_xattr, + Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -141,6 +144,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -209,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } + if (f2fs_sb_has_quota_ino(sb)) { + f2fs_msg(sb, KERN_INFO, + "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + qname = match_strdup(args); if (!qname) { f2fs_msg(sb, KERN_ERR, @@ -287,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } } + + if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_INFO, + "QUOTA feature is enabled, so ignore jquota_fmt"); + sbi->s_jquota_fmt = 0; + } + if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -1; + } return 0; } #endif @@ -383,6 +405,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_xattr: clear_opt(sbi, INLINE_XATTR); break; + case Opt_inline_xattr_size: + if (args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + sbi->inline_xattr_size = arg; + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -506,10 +534,10 @@ static int parse_options(struct super_block *sb, char *options) #endif break; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; break; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -604,6 +632,24 @@ static int parse_options(struct super_block *sb, 
char *options) F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!test_opt(sbi, INLINE_XATTR)) { + f2fs_msg(sb, KERN_ERR, + "inline_xattr_size option should be " + "set with inline_xattr option"); + return -EINVAL; + } + if (!sbi->inline_xattr_size || + sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { + f2fs_msg(sb, KERN_ERR, + "inline xattr size is out of range"); + return -EINVAL; + } + } return 0; } @@ -618,13 +664,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); /* Initialize f2fs-specific inode info */ - fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); @@ -673,7 +719,6 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); - fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } @@ -781,6 +826,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + bool dropped; f2fs_quota_off_umount(sb); @@ -801,9 +847,9 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi, true); + dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -858,6 +904,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return 0; + trace_f2fs_sync_fs(sb, sync); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -957,7 +1006,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi) - - sbi->reserved_blocks; + sbi->current_reserved_blocks; avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -1046,6 +1095,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_xattr"); else seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + sbi->inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1108,6 +1160,7 @@ static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; + sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1115,7 +1168,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); - sbi->sb->s_flags |= MS_LAZYTIME; + sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); @@ -1136,6 +1189,9 @@ static void default_options(struct f2fs_sb_info *sbi) #endif } +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif static int f2fs_remount(struct super_block *sb, int *flags, char 
*data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1180,7 +1236,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #endif /* recover superblocks we couldn't write due to previous RO mount */ - if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_msg(sb, KERN_INFO, "Try to recover all the superblocks, ret: %d", err); @@ -1199,19 +1255,26 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; - if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { +#ifdef CONFIG_QUOTA + if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; } else { /* dquot_resume needs RW */ - sb->s_flags &= ~MS_RDONLY; - dquot_resume(sb, -1); + sb->s_flags &= ~SB_RDONLY; + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } } - +#endif /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1225,7 +1288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); need_restart_gc = true; @@ -1237,7 +1300,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1251,7 +1314,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); destroy_flush_cmd_control(sbi, false); } else { @@ -1266,8 +1329,8 @@ skip: kfree(s_qf_names[i]); #endif /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); return 0; restore_gc: @@ -1320,8 +1383,13 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: page = read_mapping_page(mapping, blkidx, NULL); - if (IS_ERR(page)) + if (IS_ERR(page)) { + if (PTR_ERR(page) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto repeat; + } return PTR_ERR(page); + } lock_page(page); @@ -1364,11 +1432,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, while (towrite > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); - +retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, &page, NULL); - if (unlikely(err)) + if (unlikely(err)) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } break; + } kaddr = kmap_atomic(page); memcpy(kaddr + offset, data, tocopy); @@ -1385,8 +1458,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return 0; - inode->i_version++; + return err; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; @@ -1408,19 +1480,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) sbi->s_jquota_fmt, type); } -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) { - int i, ret; + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } for (i = 0; i < MAXQUOTAS; i++) { if (sbi->s_qf_names[i]) { - ret = f2fs_quota_on_mount(sbi, i); - if (ret < 0) - f2fs_msg(sbi->sb, KERN_ERR, - "Cannot turn on journaled " - "quota: error %d", ret); + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quotas: %d on %d", err, i); + } + } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(sb)); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_msg(sb, KERN_ERR, + "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(F2FS_SB(sb), USRQUOTA), + test_opt(F2FS_SB(sb), GRPQUOTA), + test_opt(F2FS_SB(sb), PRJQUOTA), + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to enable quota tracking " + "(type=%d, err=%d). 
Please run " + "fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; + } } } + return 0; } static int f2fs_quota_sync(struct super_block *sb, int type) @@ -1491,7 +1635,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); - if (err) + if (err || f2fs_sb_has_quota_ino(sb)) goto out_put; inode_lock(inode); @@ -1651,7 +1795,7 @@ static loff_t max_file_blocks(void) /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - - * F2FS_INLINE_XATTR_ADDRS), but now f2fs tries to reserve more + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs tries to reserve more * space in inode.i_addr, it will be more safe to reassign * result as zero. */ @@ -1960,6 +2104,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (j = HOT; j < NR_TEMP_TYPE; j++) mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2310,7 +2457,10 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - sb->s_qcop = &f2fs_quotactl_ops; + if (f2fs_sb_has_quota_ino(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif @@ -2322,8 +2472,8 @@ try_onemore: sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; sb->s_time_gran = 1; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ @@ -2408,6 +2558,7 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -2482,10 +2633,24 @@ try_onemore: if (err) goto free_root_inode; +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", err); + goto free_sysfs; + } + } +#endif /* if there are any orphan nodes, free them */ err = recover_orphan_inodes(sbi); if (err) - goto free_sysfs; + goto free_meta; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2519,7 +2684,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_sysfs; + goto free_meta; } } skip_recovery: @@ -2553,6 +2718,10 @@ skip_recovery: return 0; free_meta: +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by recover_orphan_inodes() @@ -2561,7 +2730,9 @@ free_meta: * falls into an infinite loop in sync_meta_pages().
*/ truncate_inode_pages_final(META_MAPPING(sbi)); +#ifdef CONFIG_QUOTA free_sysfs: +#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e2c258f717cd..9835348b6e5d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -30,7 +30,7 @@ enum { FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif - RESERVED_BLOCKS, + RESERVED_BLOCKS, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -63,6 +63,13 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return NULL; } +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -100,10 +107,22 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_chksum(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "quota_ino"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -143,34 +162,22 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if ((unsigned long)sbi->total_valid_block_count + t > - (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)sbi->user_block_count) { spin_unlock(&sbi->stat_lock); return -EINVAL; } *ui = t; + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); spin_unlock(&sbi->stat_lock); return count; } if (!strcmp(a->attr.name, "discard_granularity")) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; - if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; if (t == *ui) return count; - - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) { - if (i >= t - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - else - dcc->pend_list_tag[i] &= (~P_ACTIVE); - } - mutex_unlock(&dcc->cmd_lock); - *ui = t; return count; } @@ -222,6 +229,8 @@ enum feat_id { FEAT_EXTRA_ATTR, FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, + FEAT_FLEXIBLE_INLINE_XATTR, + FEAT_QUOTA_INO, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -234,6 +243,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_EXTRA_ATTR: case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: + case FEAT_FLEXIBLE_INLINE_XATTR: + case FEAT_QUOTA_INO: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -279,6 +290,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, 
f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -291,8 +303,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif +F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); #ifdef CONFIG_F2FS_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -304,6 +318,8 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -321,6 +337,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -333,9 +350,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif + ATTR_LIST(dirty_segments), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), NULL, }; @@ -350,6 +369,8 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(extra_attr), ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), NULL, }; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7c65540148f8..ec8961ef8cac 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -217,12 +217,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, - void **last_addr, int index, - size_t len, const char *name) +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; - unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + unsigned int inline_size = inline_xattr_size(inode); list_for_each_xattr(entry, base_addr) { if ((void *)entry + sizeof(__u32) > base_addr + inline_size || @@ -241,12 +241,54 @@ static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, return entry; } +static int read_inline_xattr(struct inode *inode, struct page *ipage, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(inode, ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + inline_addr = inline_xattr_addr(inode, page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + return 0; +} + +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute 
block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) + return PTR_ERR(xpage); + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_put_page(xpage, 1); + + return 0; +} + static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; @@ -263,23 +305,11 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto out; - *xe = __find_inline_xattr(txattr_addr, &last_addr, + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); if (*xe) goto check; @@ -287,19 +317,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto out; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } if (last_addr) @@ -324,7 +344,6 @@ out: static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = VALID_XATTR_BLOCK_SIZE; @@ -339,38 +358,16 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto fail; } /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. 
*/ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto fail; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } header = XATTR_HDR(txattr_addr); @@ -392,10 +389,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = inline_xattr_size(inode); + struct page *in_page = NULL; void *xattr_addr; + void *inline_addr = NULL; struct page *xpage; nid_t new_nid = 0; - int err; + int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) @@ -403,30 +402,30 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); - set_page_dirty(ipage); + inline_addr = inline_xattr_addr(inode, ipage); } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { + in_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(in_page)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(page); + return PTR_ERR(in_page); } - inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE, true); + inline_addr = inline_xattr_addr(inode, in_page); } - memcpy(inline_addr, txattr_addr, inline_size); - f2fs_put_page(page, 1); + f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode, ipage); + err = truncate_xattr_node(inode); alloc_nid_failed(sbi, new_nid); - return err; + if (err) { + f2fs_put_page(in_page, 1); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + set_page_dirty(ipage ? ipage : in_page); + goto in_page_out; } } @@ -435,7 +434,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } f2fs_bug_on(sbi, new_nid); f2fs_wait_on_page_writeback(xpage, NODE, true); @@ -445,17 +444,24 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } alloc_nid_done(sbi, new_nid); } - xattr_addr = page_address(xpage); + + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + set_page_dirty(ipage ? 
ipage : in_page); set_page_dirty(xpage); - f2fs_put_page(xpage, 1); - return 0; + f2fs_put_page(xpage, 1); +in_page_out: + f2fs_put_page(in_page, 1); + return err; } int f2fs_getxattr(struct inode *inode, int index, const char *name, @@ -681,6 +687,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; + err = dquot_initialize(inode); + if (err) + return err; + /* this case is only from init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 81cecbe6d7cf..b833ffeee1e1 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -291,7 +291,6 @@ static int fat_parse_long(struct inode *dir, loff_t *pos, } } parse_long: - slots = 0; ds = (struct msdos_dir_slot *)*de; id = ds->id; if (!(id & 0x40)) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 48b2336692f9..bac10de678cc 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -392,7 +392,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); brelse(c_bh); if (err) @@ -597,7 +597,7 @@ int fat_free_clusters(struct inode *inode, int cluster) } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; @@ -612,7 +612,7 @@ int fat_free_clusters(struct inode *inode, int cluster) fat_collect_bhs(bhs, &nr_bhs, &fatent); } while (cluster != FAT_ENT_EOF); - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 30c52394a7ad..20a0a89eaca5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -779,14 +779,14 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { - int new_rdonly; + bool new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); - *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); sync_filesystem(sb); /* make sure we update state on remount. */ - new_rdonly = *flags & MS_RDONLY; + new_rdonly = *flags & SB_RDONLY; if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); @@ -1352,7 +1352,7 @@ out: if (opts->unicode_xlate) opts->utf8 = 0; if (opts->nfs == FAT_NFS_NOSTALE_RO) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_export_op = &fat_export_ops_nostale; } @@ -1608,7 +1608,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, return -ENOMEM; sb->s_fs_info = sbi; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index acc3aa30ee54..f9bdc1e01c98 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -33,7 +33,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) 
if (opts->errors == FAT_ERRORS_PANIC) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 7d6a105d601b..d24d2758a363 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -646,7 +646,7 @@ static void setup(struct super_block *sb) { MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; sb->s_d_op = &msdos_dentry_operations; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; } static int msdos_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/fcntl.c b/fs/fcntl.c index 30f47d0f74a0..0522e283a4f4 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -563,6 +563,9 @@ static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __u { struct compat_flock64 fl; + BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start)); + BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); + memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) @@ -632,9 +635,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, if (err) break; err = fixup_compat_flock(&flock); - if (err) - return err; - err = put_compat_flock(&flock, compat_ptr(arg)); + if (!err) + err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: @@ -642,12 +644,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, if (err) break; err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); - if (err) - break; - err = fixup_compat_flock(&flock); - if (err) - return err; - err = put_compat_flock64(&flock, compat_ptr(arg)); + if (!err) + err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: diff --git a/fs/fhandle.c b/fs/fhandle.c index 474adc8d2a3a..0ace128f5d23 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -213,8 +213,8 @@ out_err: return retval; } -long do_handle_open(int mountdirfd, - struct file_handle __user *ufh, int open_flag) +static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, + int open_flag) { long retval = 0; struct path path; diff --git a/fs/file.c b/fs/file.c index 4eecbf4244a5..3b080834b870 100644 --- a/fs/file.c +++ b/fs/file.c @@ -593,13 +593,16 @@ void __fd_install(struct files_struct *files, unsigned int fd, { struct fdtable *fdt; - might_sleep(); rcu_read_lock_sched(); - while (unlikely(files->resize_in_progress)) { + if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - wait_event(files->resize_wait, !files->resize_in_progress); - rcu_read_lock_sched(); + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); + return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); @@ -632,7 +635,6 @@ int __close_fd(struct files_struct *files, unsigned fd) if (!file) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); - __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); return filp_close(file, files); diff --git a/fs/file_table.c b/fs/file_table.c index 49e1f2f1a4cb..2dc9f38bd195 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -312,7 +312,7 @@ void put_filp(struct file *file) void __init files_init(void) { filp_cachep = kmem_cache_create("filp", 
sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 455ce5b77e9b..f989efa051a0 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -116,7 +116,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -220,7 +220,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) int ret = -EINVAL; u32 j; - sbp->s_flags |= MS_RDONLY; + sbp->s_flags |= SB_RDONLY; infp = kzalloc(sizeof(*infp), GFP_KERNEL); if (!infp) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 08f5debd07d1..cea4836385b7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -490,7 +490,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & MS_ACTIVE) || + if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 40d61077bead..ff84258132bb 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -558,7 +558,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) * have completed. */ if (!atomic_dec_and_test(&cookie->n_active)) - wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, + wait_on_atomic_t(&cookie->n_active, atomic_t_wait, TASK_UNINTERRUPTIBLE); /* Make sure any pending writes are cancelled. 
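The fscache cookie hunk above switches to the stock waiter for atomic_t: with the 4.15 wait API the action callback receives the sleep mode, and atomic_t_wait is the library-provided default, which is what lets the open-coded fscache_wait_atomic_t() be deleted in the next two hunks. A sketch of the wait/wake pairing under that API; the counter and function names here are invented:

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>

static atomic_t n_active = ATOMIC_INIT(1);

static void example_put_active(void)
{
	/* the final put wakes anyone sleeping in wait_on_atomic_t() */
	if (atomic_dec_and_test(&n_active))
		wake_up_atomic_t(&n_active);
}

static void example_wait_for_idle(void)
{
	/* sleeps until n_active drops to zero */
	wait_on_atomic_t(&n_active, atomic_t_wait, TASK_UNINTERRUPTIBLE);
}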
*/ diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 97ec45110957..0ff4b49a0037 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void) return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); } -extern int fscache_wait_atomic_t(atomic_t *); - /* * object.c */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index b39d487ccfb0..249968dcbf5c 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -195,12 +195,3 @@ static void __exit fscache_exit(void) } module_exit(fscache_exit); - -/* - * wait_on_atomic_t() sleep function for uninterruptible waiting - */ -int fscache_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 0ad3fd3ad0b4..961029e04027 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1175,7 +1175,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, return; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = 0; do { if (!pagevec_lookup(&pvec, mapping, &next)) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a42d89371748..17f0d05bfd4c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1636,7 +1636,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, false); + release_pages(req->pages, req->num_pages); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 94a745acaef8..624f18bbfd2b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -31,7 +31,7 @@ static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); -static int set_global_limit(const char *val, struct kernel_param *kp); +static int set_global_limit(const char *val, const struct kernel_param *kp); unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, @@ -130,7 +130,7 @@ static void fuse_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - if (inode->i_sb->s_flags & MS_ACTIVE) { + if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); @@ -141,7 +141,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (*flags & MS_MANDLOCK) + if (*flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -823,7 +823,7 @@ static void sanitize_global_limit(unsigned *limit) *limit = (1 << 16) - 1; } -static int set_global_limit(const char *val, struct kernel_param *kp) +static int set_global_limit(const char *val, const struct kernel_param *kp) { int rv; @@ -1056,10 +1056,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) int is_bdev = sb->s_bdev != NULL; err = -EINVAL; - if (sb->s_flags & MS_MANDLOCK) + if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION); + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); if (!parse_fuse_opt(data, &d, is_bdev)) goto err; @@ -1109,9 +1109,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_dev_free; /* Handle umasking inside the fuse code */ - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) fc->dont_mask = 1; - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; fc->default_permissions = 
d.default_permissions; fc->allow_other = d.allow_other; @@ -1273,9 +1273,9 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, - fuse_inode_init_once); + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT, + fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) goto out; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 68ed06962537..1daf15a1f00c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -280,22 +280,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - ret = 1; - break; - } - *done_index = page->index; lock_page(page); @@ -387,7 +371,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -413,8 +397,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a3711f543405..ad55eb86a250 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1065,15 +1065,15 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_args = *args; if (sdp->sd_args.ar_spectator) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; @@ -1257,7 +1257,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, struct gfs2_args args; struct gfs2_sbd *sdp; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1313,15 +1313,15 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { error = -EBUSY; - if ((flags ^ s->s_flags) & MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) goto error_super; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, &args, flags & SB_SILENT ? 
1 : 0); if (error) goto error_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } @@ -1365,7 +1365,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, pr_warn("gfs2 mount does not exist\n"); return ERR_CAST(s); } - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9cb5c9a97d69..d81d46e19726 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1256,10 +1256,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) return -EINVAL; if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; - if ((sb->s_flags ^ *flags) & MS_RDONLY) { - if (*flags & MS_RDONLY) + if ((sb->s_flags ^ *flags) & SB_RDONLY) { + if (*flags & SB_RDONLY) error = gfs2_make_fs_ro(sdp); else error = gfs2_make_fs_rw(sdp); @@ -1269,9 +1269,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sdp->sd_args = args; if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; else - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); else diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index a85ca8b2c9ba..ca8b72d0a831 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -117,7 +117,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) kfree(tr); up_read(&sdp->sd_log_flush_lock); - if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); if (alloced) sb_end_intwrite(sdp->sd_vfs); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 8aec5e732abf..b63a4df7327b 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -98,13 +98,11 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page *src_page, *dst_page; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; @@ -237,7 +235,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -249,7 +246,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 894994d2c885..460281b1299e 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -204,11 +204,11 @@ int hfs_mdb_get(struct super_block *sb) attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. 
mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 7e0d65e9586c..173876782f73 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -114,18 +114,18 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + *flags |= SB_NODIRATIME; + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -407,7 +407,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index d77015c3f22c..177fae4e6581 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -127,14 +127,12 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page **src_page, **dst_page; int l; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); @@ -401,7 +399,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -414,7 +411,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index e5bb2de2262a..1d458b716957 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -329,9 +329,9 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; int force = 0; @@ -340,20 +340,20 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 
pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (force) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -455,16 +455,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("Filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !sb_rdonly(sb)) { pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } err = -EINVAL; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 8d6b7e35faf9..c83ece7facc5 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -150,7 +150,6 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) if (unlikely(ret < 0)) goto out; ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; - file->f_version = inode->i_version; } next_pos = ctx->pos; if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 3b834563b1f1..a4ad18afbdec 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -419,7 +419,6 @@ int hpfs_add_dirent(struct inode *i, c = 1; goto ret; } - i->i_version++; c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); ret: return c; @@ -726,7 +725,6 @@ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, return 2; } } - i->i_version++; for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1); hpfs_delete_de(i->i_sb, dnode, de); hpfs_mark_4buffers_dirty(qbh); diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index e0e60b148400..7c49f1ef0c85 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -288,7 +288,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, goto bail; } if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { - if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & SB_RDONLY) goto ok; hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 1516fb4e28f4..f2c3ebcd309c 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -78,7 +78,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) 
else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } } else if (sb_rdonly(s)) pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); @@ -235,7 +235,6 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; - ei->vfs_inode.i_version = 1; return &ei->vfs_inode; } @@ -457,7 +456,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sync_filesystem(s); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; hpfs_lock(s); uid = sbi->sb_uid; gid = sbi->sb_gid; @@ -488,7 +487,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; sbi->sb_err = errs; sbi->sb_timeshift = timeshift; - if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); + if (!(*flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; @@ -614,7 +613,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ed113ea17aff..8a85f3f53446 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -407,7 +407,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = start; while (next < end) { /* @@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * page_put due to reference from alloc_huge_page() * unlock_page because locked by add_to_page_cache() + * page_put due to reference from alloc_huge_page() */ - put_page(page); unlock_page(page); + put_page(page); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) @@ -668,7 +668,6 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - error = -EINVAL; if (attr->ia_size & ~huge_page_mask(h)) return -EINVAL; error = hugetlb_vmtruncate(inode, attr->ia_size); diff --git a/fs/inode.c b/fs/inode.c index fd401028a309..03102d6ef044 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -416,7 +416,7 @@ void inode_add_lru(struct inode *inode) { if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) && - !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) inode_lru_list_add(inode); } @@ -595,7 +595,7 @@ static void dispose_list(struct list_head *head) * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is - * called by superblock shutdown after having MS_ACTIVE flag removed, + * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. 
*/ @@ -1492,7 +1492,7 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); - if (!drop && (sb->s_flags & MS_ACTIVE)) { + if (!drop && (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode); spin_unlock(&inode->i_lock); return; @@ -1644,7 +1644,7 @@ int generic_update_time(struct inode *inode, struct timespec *time, int flags) if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME) || (flags & S_VERSION)) iflags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, iflags); return 0; @@ -1691,7 +1691,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, if (IS_NOATIME(inode)) return false; - if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) diff --git a/fs/internal.h b/fs/internal.h index 48cee21b4f14..df262f41a0ef 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,6 +55,7 @@ extern void __init chrdev_init(void); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); +long do_unlinkat(int dfd, struct filename *name); /* * namespace.c diff --git a/fs/iomap.c b/fs/iomap.c index b9f74803e56c..47d29ccffaef 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -856,6 +856,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct bio *bio; bool need_zeroout = false; int nr_pages, ret; + size_t copied = 0; if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -867,7 +868,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, /*FALLTHRU*/ case IOMAP_UNWRITTEN: if (!(dio->flags & IOMAP_DIO_WRITE)) { - iov_iter_zero(length, dio->submit.iter); + length = iov_iter_zero(length, dio->submit.iter); dio->size += length; return length; } @@ -904,8 +905,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, } do { - if (dio->error) + size_t n; + if (dio->error) { + iov_iter_revert(dio->submit.iter, copied); return 0; + } bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); @@ -918,20 +922,24 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, ret = bio_iov_iter_get_pages(bio, &iter); if (unlikely(ret)) { bio_put(bio); - return ret; + return copied ? 
copied : ret; } + n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); - task_io_account_write(bio->bi_iter.bi_size); + task_io_account_write(n); } else { bio_set_op_attrs(bio, REQ_OP_READ, 0); if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } - dio->size += bio->bi_iter.bi_size; - pos += bio->bi_iter.bi_size; + iov_iter_advance(dio->submit.iter, n); + + dio->size += n; + pos += n; + copied += n; nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); @@ -947,9 +955,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - - iov_iter_advance(dio->submit.iter, length); - return length; + return copied; } ssize_t diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 447a24d77b89..bc258a4402f6 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -114,7 +114,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EROFS; return 0; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d2a85c9720e9..67546c7ad473 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + int ret = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) + ret = 0; + if (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid) + ret = 0; + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_transaction_committed); + /* * When this function returns the transaction corresponding to tid * will be completed. 
If the transaction has currently running, start diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index e96c6b05e43e..d8c274d39ddb 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -409,10 +409,10 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) mutex_unlock(&c->alloc_sem); } - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) jffs2_start_garbage_collect_thread(c); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 824e61ede465..c2fbec19c616 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -59,7 +59,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) } -#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) +#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & SB_RDONLY) #define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) ) #ifndef CONFIG_JFFS2_FS_WRITEBUFFER diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 153f1c6eb169..f60dee7faf03 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -301,10 +301,10 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &jffs2_super_operations; sb->s_export_op = &jffs2_export_ops; - sb->s_flags = sb->s_flags | MS_NOATIME; + sb->s_flags = sb->s_flags | SB_NOATIME; sb->s_xattr = jffs2_xattr_handlers; #ifdef CONFIG_JFFS2_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif ret = jffs2_do_fill_super(sb, data, silent); return ret; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2f7b3af5b8b7..90373aebfdca 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -87,7 +87,7 @@ static void jfs_handle_error(struct super_block *sb) else if (sbi->flag & JFS_ERR_REMOUNT_RO) { jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* nothing is done for continue beyond marking the superblock dirty */ @@ -477,7 +477,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) return rc; } - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { /* * Invalidate any previously read metadata. 
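The jfs_remount() hunks here tie quota state to the read-only transition: dropping SB_RDONLY is followed by dquot_resume(), and going read-only is preceded by dquot_suspend(). Both transitions are driven from userspace via MS_REMOUNT; a minimal sketch, with a placeholder mountpoint:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* flip an existing mount read-only; the source and fstype
	 * arguments are ignored for MS_REMOUNT */
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL) == -1) {
		perror("mount");
		return 1;
	}
	return 0;
}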
fsck may have * changed the on-disk data since we mounted r/o @@ -488,12 +488,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) ret = jfs_mount_rw(sb, 1); /* mark the fs r/w for quota activity */ - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; dquot_resume(sb, -1); return ret; } - if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -545,7 +545,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sbi->flag = flag; #ifdef CONFIG_JFS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif if (newLVSize) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 95a7c88baed9..26dd9a50f383 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -335,7 +335,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, deactivate_locked_super(sb); return ERR_PTR(error); } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); list_add(&info->node, &root->supers); diff --git a/fs/libfs.c b/fs/libfs.c index 3aabe553fc45..7ff3cb904acd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -246,7 +246,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -277,7 +277,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_d_op = dops; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); Enomem: @@ -578,7 +578,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); - mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0d4e590e0549..826a89184f90 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -578,8 +578,10 @@ static void nlm_complain_hosts(struct net *net) if (ln->nrhosts == 0) return; - printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net); - dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net); + pr_warn("lockd: couldn't shutdown host module for net %x!\n", + net->ns.inum); + dprintk("lockd: %lu hosts left in net %x:\n", ln->nrhosts, + net->ns.inum); } else { if (nrhosts == 0) return; @@ -590,9 +592,9 @@ static void nlm_complain_hosts(struct net *net) for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; - dprintk(" %s (cnt %d use %d exp %ld net %p)\n", + dprintk(" %s (cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, host->net->ns.inum); } } @@ -605,7 +607,8 @@ nlm_shutdown_hosts_net(struct net *net) mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ - dprintk("lockd: nuking all hosts in net %p...\n", net); + dprintk("lockd: nuking all hosts in net %x...\n", + net ? 
net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -618,9 +621,8 @@ nlm_shutdown_hosts_net(struct net *net) /* Then, perform a garbage collection pass */ nlm_gc_hosts(net); - mutex_unlock(&nlm_host_mutex); - nlm_complain_hosts(net); + mutex_unlock(&nlm_host_mutex); } /* @@ -646,7 +648,8 @@ nlm_gc_hosts(struct net *net) struct hlist_node *next; struct nlm_host *host; - dprintk("lockd: host garbage collection for net %p\n", net); + dprintk("lockd: host garbage collection for net %x\n", + net ? net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -662,9 +665,10 @@ nlm_gc_hosts(struct net *net) if (atomic_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " - "(cnt %d use %d exp %ld net %p)\n", + "(cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, + host->net->ns.inum); continue; } nlm_destroy_host_locked(host); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9fbbd11f9ecb..96cfb2967ac7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -110,7 +110,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, clnt = nsm_create(host->net, host->nodename); if (IS_ERR(clnt)) { dprintk("lockd: failed to create NSM upcall transport, " - "status=%ld, net=%p\n", PTR_ERR(clnt), host->net); + "status=%ld, net=%x\n", PTR_ERR(clnt), + host->net->ns.inum); return PTR_ERR(clnt); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b995bdc13976..9c36d614bf89 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,6 +57,9 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; +atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); + unsigned int lockd_net_id; /* @@ -259,7 +262,7 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) if (error < 0) goto err_bind; set_grace_period(net); - dprintk("lockd_up_net: per-net data created; net=%p\n", net); + dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: @@ -274,12 +277,15 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); - dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); + dprintk("%s: per-net data destroyed; net=%x\n", + __func__, net->ns.inum); } } else { - printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", - nlmsvc_task, net); + pr_err("%s: no users! 
task=%p, net=%x\n", + __func__, nlmsvc_task, net->ns.inum); BUG(); } } @@ -290,7 +296,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -301,6 +308,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -317,7 +326,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -329,6 +339,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -345,10 +357,12 @@ static void lockd_unregister_notifiers(void) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif + wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); } static void lockd_svc_exit_thread(void) { + atomic_dec(&nlm_ntf_refcnt); lockd_unregister_notifiers(); svc_exit_thread(nlmsvc_rqst); } @@ -369,9 +383,11 @@ static int lockd_start_svc(struct svc_serv *serv) printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); + lockd_unregister_notifiers(); goto out_rqst; } + atomic_inc(&nlm_ntf_refcnt); svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; @@ -459,13 +475,16 @@ int lockd_up(struct net *net) } error = lockd_up_net(serv, net); - if (error < 0) - goto err_net; + if (error < 0) { + lockd_unregister_notifiers(); + goto err_put; + } error = lockd_start_svc(serv); - if (error < 0) - goto err_start; - + if (error < 0) { + lockd_down_net(serv, net); + goto err_put; + } nlmsvc_users++; /* * Note: svc_serv structures have an initial use count of 1, @@ -476,12 +495,6 @@ err_put: err_create: mutex_unlock(&nlmsvc_mutex); return error; - -err_start: - lockd_down_net(serv, net); -err_net: - lockd_unregister_notifiers(); - goto err_put; } EXPORT_SYMBOL_GPL(lockd_up); @@ -602,7 +615,7 @@ static struct ctl_table nlm_sysctl_root[] = { */ #define param_set_min_max(name, type, which_strtol, min, max) \ -static int param_set_##name(const char *val, struct kernel_param *kp) \ +static int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ char *endp; \ __typeof__(type) num = which_strtol(val, &endp, 0); \ @@ -678,6 +691,17 @@ static int lockd_init_net(struct net *net) static void lockd_exit_net(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + WARN_ONCE(!list_empty(&ln->lockd_manager.list), + "net %x %s: lockd_manager.list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(!list_empty(&ln->nsm_handles), + "net %x %s: nsm_handles list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(delayed_work_pending(&ln->grace_period_end), + "net %x %s: grace_period_end was not cancelled\n", + net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index a563ddbc19e6..4ec3d6e03e76 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -370,7 +370,7 @@ nlmsvc_mark_resources(struct 
net *net) { struct nlm_host hint; - dprintk("lockd: nlmsvc_mark_resources for net %p\n", net); + dprintk("lockd: %s for net %x\n", __func__, net ? net->ns.inum : 0); hint.net = net; nlm_traverse_files(&hint, nlmsvc_mark_host, NULL); } diff --git a/fs/locks.c b/fs/locks.c index 1bd71c4d663a..21b4dfa289ee 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -141,7 +141,7 @@ static inline bool is_remote_lock(struct file *filp) { - return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); + return likely(!(filp->f_path.dentry->d_sb->s_flags & SB_NOREMOTELOCK)); } static bool lease_breaking(struct file_lock *fl) diff --git a/fs/mbcache.c b/fs/mbcache.c index d818fd236787..b8b8b9ced9f8 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); + /* Unlikely, but not impossible */ + if (unlikely(cache->c_entry_count < 0)) + return 0; return cache->c_entry_count; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index b6829d679643..72e308c3e66b 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -125,9 +125,9 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) sync_filesystem(sb); ms = sbi->s_ms; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { if (ms->s_state & MINIX_VALID_FS || !(sbi->s_mount_state & MINIX_VALID_FS)) return 0; diff --git a/fs/namei.c b/fs/namei.c index 5424b10cfdc4..9cc91fb7f156 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | - LOOKUP_AUTOMOUNT))) { - /* Positive dentry that isn't meant to trigger an - * automount, EISDIR will allow it to be used, - * otherwise there's no mount here "now" so return - * ENOENT. - */ - if (path->dentry->d_inode) - return -EISDIR; - else - return -ENOENT; - } + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; @@ -3459,7 +3450,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, goto out; child = vfs_tmpfile(path.dentry, op->mode, op->open_flag); error = PTR_ERR(child); - if (unlikely(IS_ERR(child))) + if (IS_ERR(child)) goto out2; dput(path.dentry); path.dentry = child; @@ -4010,10 +4001,9 @@ EXPORT_SYMBOL(vfs_unlink); * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. 
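The lockd conversions above, like the NFS callback ones further down, replace "%p" net pointers in debug output with net->ns.inum, the inode number behind the namespace's /proc handle; that stops the logs from leaking kernel addresses and prints a value userspace can correlate. The same identifier is visible from userspace (note the kernel messages print it in hex while the procfs link shows it in decimal):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n = readlink("/proc/self/ns/net", buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("readlink");
		return 1;
	}
	buf[n] = '\0';
	printf("%s\n", buf);	/* e.g. "net:[4026531992]" */
	return 0;
}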
*/ -static long do_unlinkat(int dfd, const char __user *pathname) +long do_unlinkat(int dfd, struct filename *name) { int error; - struct filename *name; struct dentry *dentry; struct path path; struct qstr last; @@ -4022,8 +4012,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = filename_parentat(dfd, getname(pathname), lookup_flags, - &path, &last, &type); + name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); @@ -4065,12 +4054,12 @@ exit2: mnt_drop_write(path.mnt); exit1: path_put(&path); - putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } + putname(name); return error; slashes: @@ -4091,12 +4080,12 @@ SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) if (flag & AT_REMOVEDIR) return do_rmdir(dfd, pathname); - return do_unlinkat(dfd, pathname); + return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { - return do_unlinkat(AT_FDCWD, pathname); + return do_unlinkat(AT_FDCWD, getname(pathname)); } int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) diff --git a/fs/namespace.c b/fs/namespace.c index e158ec6b527b..9d1374ab6e06 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2826,6 +2826,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_DIRSYNC | SB_SILENT | SB_POSIXACL | + SB_LAZYTIME | SB_I_VERSION); if (flags & MS_REMOUNT) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 129f1937fa2c..41de88cdc053 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -103,7 +103,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -547,7 +547,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) else default_bufsize = 1024; - sb->s_flags |= MS_NODIRATIME; /* probably even noatime */ + sb->s_flags |= SB_NODIRATIME; /* probably even noatime */ sb->s_maxbytes = 0xFFFFFFFFU; sb->s_blocksize = 1024; /* Eh... Is this correct? 
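The do_unlinkat() change above lifts getname() out to the callers so that in-kernel users can hand in a struct filename directly; the syscall-facing behaviour is unchanged. For reference, the userspace entry points it serves, with a placeholder path:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* flag 0 unlinks a file; AT_REMOVEDIR routes to do_rmdir() instead */
	if (unlinkat(AT_FDCWD, "scratch.tmp", 0) == -1) {
		perror("unlinkat");
		return 1;
	}
	return 0;
}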
*/ sb->s_blocksize_bits = 10; diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index b60627bcfc62..ef6729568432 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -67,7 +67,7 @@ out: */ void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) { - if (atomic_dec_and_test(&dreq->count)) + if (refcount_dec_and_test(&dreq->count)) kfree(dreq); } @@ -87,7 +87,7 @@ static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) dreq = container_of(req, struct nfs_cache_defer_req, req); dreq->deferred_req.revisit = nfs_dns_cache_revisit; - atomic_inc(&dreq->count); + refcount_inc(&dreq->count); return &dreq->deferred_req; } @@ -99,7 +99,7 @@ struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); if (dreq) { init_completion(&dreq->completion); - atomic_set(&dreq->count, 1); + refcount_set(&dreq->count, 1); dreq->req.defer = nfs_dns_cache_defer; } return dreq; } diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 4e6236a86cf7..220ee409abc4 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -16,7 +16,7 @@ struct nfs_cache_defer_req { struct cache_req req; struct cache_deferred_req deferred_req; struct completion completion; - atomic_t count; + refcount_t count; }; extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index cd9d992feb2e..509dc5adeb8f 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -49,15 +49,15 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) if (ret <= 0) goto out_err; nn->nfs_callback_tcpport = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport, PF_INET, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", + nn->nfs_callback_tcpport, PF_INET, net->ns.inum); ret = svc_create_xprt(serv, "tcp", net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { nn->nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport6, PF_INET6, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", + nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum); } else if (ret != -EAFNOSUPPORT) goto out_err; return 0; @@ -185,7 +185,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc if (--nn->cb_users[minorversion]) return; - dprintk("NFS: destroy per-net callback data; net=%p\n", net); + dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); svc_shutdown_net(serv, net); } @@ -198,7 +198,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (nn->cb_users[minorversion]++) return 0; - dprintk("NFS: create per-net callback data; net=%p\n", net); + dprintk("NFS: create per-net callback data; net=%x\n", net->ns.inum); ret = svc_bind(serv, net); if (ret < 0) { @@ -223,7 +223,7 @@ err_socks: err_bind: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " - "net = %p\n", ret, net); + "net = %x\n", ret, net->ns.inum); return ret; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 19151f6c0e97..2435af56b87e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -440,7 +440,7 @@ static bool referring_call_exists(struct nfs_client *clp, uint32_t nrclists, struct referring_call_list *rclists) { - bool status = 0; + bool status = false; int i, j; struct nfs4_session *session; struct nfs4_slot_table *tbl; diff --git a/fs/nfs/client.c
b/fs/nfs/client.c index 22880ef6d8dd..b9129e2befea 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -163,7 +163,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; - atomic_set(&clp->cl_count, 1); + refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); @@ -269,7 +269,7 @@ void nfs_put_client(struct nfs_client *clp) nn = net_generic(clp->cl_net, nfs_net_id); - if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { + if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); @@ -291,12 +291,23 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); +again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; + /* If a client is still initializing then we need to wait */ + if (clp->cl_cons_state > NFS_CS_READY) { + refcount_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + nfs_wait_client_init_complete(clp); + nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); + goto again; + } + /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; @@ -314,7 +325,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat sap)) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); return clp; } return NULL; @@ -1006,7 +1017,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; - atomic_inc(&server->nfs_client->cl_count); + refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; @@ -1166,7 +1177,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v) clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), - atomic_read(&clp->cl_count), + refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 606dd3871f66..ade44ca0c66c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1041,6 +1041,33 @@ int nfs_delegations_present(struct nfs_client *clp) } /** + * nfs4_refresh_delegation_stateid - Update delegation stateid seqid + * @dst: stateid to refresh + * @inode: inode to check + * + * Returns "true" and updates "dst->seqid" * if inode had a delegation + * that matches our delegation stateid. Otherwise "false" is returned. 
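The cl_count changes in the client.c hunks above (and the mirror and ds_clp conversions below) move object lifetimes from atomic_t to refcount_t, which saturates rather than wrapping on overflow or underflow. The lookup side of the idiom, sketched with made-up types:

#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct client {
	refcount_t count;
	struct list_head link;
};

static struct client *client_get(struct client *clp)
{
	/* a zero count means a concurrent final put already won */
	if (clp && !refcount_inc_not_zero(&clp->count))
		return NULL;
	return clp;
}

static void client_put(struct client *clp, spinlock_t *list_lock)
{
	/* drop to zero and take the list lock atomically, as
	 * nfs_put_client() does with nn->nfs_client_lock */
	if (refcount_dec_and_lock(&clp->count, list_lock)) {
		list_del(&clp->link);
		spin_unlock(list_lock);
		kfree(clp);
	}
}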
+ */ +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct nfs_delegation *delegation; + bool ret = false; + if (!inode) + goto out; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && + nfs4_stateid_match_other(dst, &delegation->stateid)) { + dst->seqid = delegation->stateid.seqid; + ret = true; + } + rcu_read_unlock(); +out: + return ret; +} + +/** * nfs4_copy_delegation_stateid - Copy inode's state ID information * @inode: inode to check * @flags: delegation type requirement diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index ddaf2644cf13..185a09f37a89 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -62,6 +62,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f439f1c45008..2f3f86726f5b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -118,13 +118,6 @@ nfs_opendir(struct inode *inode, struct file *filp) goto out; } filp->private_data = ctx; - if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { - /* This is a mountpoint, so d_revalidate will never - * have been called, so we need to refresh the - * inode (for close-open consistency) ourselves. - */ - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - } out: put_rpccred(cred); return res; @@ -253,7 +246,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri desc->cache_entry_index = index; return 0; out_eof: - desc->eof = 1; + desc->eof = true; return -EBADCOOKIE; } @@ -307,7 +300,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (array->eof_index >= 0) { status = -EBADCOOKIE; if (*desc->dir_cookie == array->last_cookie) - desc->eof = 1; + desc->eof = true; } out: return status; @@ -761,7 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = 1; + desc->eof = true; break; } desc->ctx->pos++; @@ -773,7 +766,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) ctx->duped = 1; } if (array->eof_index >= 0) - desc->eof = 1; + desc->eof = true; kunmap(desc->page); cache_page_release(desc); @@ -873,7 +866,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && desc->eof == 0) { + if (*desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) @@ -1241,8 +1234,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - if (nfs_mapping_need_revalidate_inode(inode)) - error = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_lookup_verify_inode(inode, flags); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ?
"invalid" : "valid"); return !error; @@ -1264,7 +1256,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) /* Unhash it, so that ->d_iput() would be called */ return 1; } - if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + if (!(dentry->d_sb->s_flags & SB_ACTIVE)) { /* Unhash it, so that ancestors of killed async unlink * files will be cleaned up during umount */ return 1; @@ -1393,6 +1385,7 @@ static int nfs4_lookup_revalidate(struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, @@ -2064,7 +2057,7 @@ out: * should mark the directories for revalidation. */ d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, + nfs_set_verifier(old_dentry, nfs_save_change_attribute(new_dir)); } else if (error == -ENOENT) nfs_dentry_handle_enoent(old_dentry); @@ -2369,15 +2362,15 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) } EXPORT_SYMBOL_GPL(nfs_access_add_cache); -#define NFS_MAY_READ (NFS4_ACCESS_READ) -#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND | \ - NFS4_ACCESS_DELETE) -#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND) +#define NFS_MAY_READ (NFS_ACCESS_READ) +#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND | \ + NFS_ACCESS_DELETE) +#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND) #define NFS_DIR_MAY_WRITE NFS_MAY_WRITE -#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) -#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) +#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP) +#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE) static int nfs_access_calc_mask(u32 access_result, umode_t umode) { @@ -2425,9 +2418,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) if (!may_block) goto out; - /* Be clever: ask server to check for all possible rights */ - cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE - | NFS_MAY_WRITE | NFS_MAY_READ; + /* + * Determine which access bits we want to ask for... + */ + cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + if (S_ISDIR(inode->i_mode)) + cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; + else + cache.mask |= NFS_ACCESS_EXECUTE; cache.cred = cred; status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0214dd1e1060..81cca49a8375 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -829,23 +829,9 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* - * VFS doesn't require the open mode to match a flock() lock's type. - * NFS, however, may simulate flock() locking with posix locking which - * requires the open mode to match the lock type. 
- */ - switch (fl->fl_type) { - case F_UNLCK: + /* We're simulating flock() locks using posix locks on the server */ + if (fl->fl_type == F_UNLCK) return do_unlk(filp, cmd, fl, is_local); - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 508126eb49f9..4e54d8b5413a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -471,10 +471,10 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) return PNFS_NOT_ATTEMPTED; dprintk("%s USE DS: %s cl_count %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); /* No multipath support. Use first DS */ - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -515,10 +515,10 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); hdr->pgio_done_cb = filelayout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -1064,9 +1064,9 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; dprintk("%s ino %lu, how %d cl_count %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b0fa83a60754..c75ad982bcfc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -187,7 +187,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, continue; if (!ff_mirror_match_fh(mirror, pos)) continue; - if (atomic_inc_not_zero(&pos->ref)) { + if (refcount_inc_not_zero(&pos->ref)) { spin_unlock(&inode->i_lock); return pos; } @@ -218,7 +218,7 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) mirror = kzalloc(sizeof(*mirror), gfp_flags); if (mirror != NULL) { spin_lock_init(&mirror->lock); - atomic_set(&mirror->ref, 1); + refcount_set(&mirror->ref, 1); INIT_LIST_HEAD(&mirror->mirrors); } return mirror; @@ -242,7 +242,7 @@ static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) { - if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + if (mirror != NULL && refcount_dec_and_test(&mirror->ref)) ff_layout_free_mirror(mirror); } @@ -1726,10 +1726,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + ds->ds_remotestr, 
refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_read_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) @@ -1785,11 +1785,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1863,11 +1863,11 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), vers); data->commit_done_cb = ff_layout_commit_done_cb; data->cred = ds_cred; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) @@ -2286,7 +2286,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) continue; /* mirror refcount put in cleanup_layoutstats */ - if (!atomic_inc_not_zero(&mirror->ref)) + if (!refcount_inc_not_zero(&mirror->ref)) continue; dev = &mirror->mirror_ds->id_node; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 679cb087ef3f..411798346e48 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -14,6 +14,7 @@ #define FF_FLAGS_NO_IO_THRU_MDS 2 #define FF_FLAGS_NO_READ_IO 4 +#include <linux/refcount.h> #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom @@ -82,7 +83,7 @@ struct nfs4_ff_layout_mirror { nfs4_stateid stateid; struct rpc_cred __rcu *ro_cred; struct rpc_cred __rcu *rw_cred; - atomic_t ref; + refcount_t ref; spinlock_t lock; unsigned long flags; struct nfs4_ff_layoutstat read_stat; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 134d9f560240..b992d2382ffa 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -85,9 +85,9 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); -int nfs_wait_atomic_killable(atomic_t *p) +int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode) { - return nfs_wait_killable(TASK_KILLABLE); + return nfs_wait_killable(mode); } /** @@ -752,7 +752,7 @@ int nfs_getattr(const struct path *path, struct kstat *stat, * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result - * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. 
*/ if ((path->mnt->mnt_flags & MNT_NOATIME) || @@ -783,7 +783,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { - atomic_set(&l_ctx->count, 1); + refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); @@ -797,7 +797,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context do { if (pos->lockowner != current->files) continue; - atomic_inc(&pos->count); + refcount_inc(&pos->count); return pos; } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); return NULL; @@ -836,7 +836,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); - if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; list_del(&l_ctx->list); spin_unlock(&inode->i_lock); @@ -913,7 +913,7 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->lock_context.count); + refcount_inc(&ctx->lock_context.count); return ctx; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -924,11 +924,11 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { - if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + } else if (!refcount_dec_and_test(&ctx->lock_context.count)) return; if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); @@ -2084,8 +2084,12 @@ static int nfs_net_init(struct net *net) static void nfs_net_exit(struct net *net) { + struct nfs_net *nn = net_generic(net, nfs_net_id); + nfs_fs_proc_net_exit(net); nfs_cleanup_cb_ident_idr(net); + WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); + WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } static struct pernet_operations nfs_net_ops = { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f9a4a5524bd5..8357ff69962f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -10,7 +10,7 @@ #include <linux/nfs_page.h> #include <linux/wait_bit.h> -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) +#define NFS_MS_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; @@ -388,7 +388,7 @@ extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p); +extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index bc673fb47fb3..49f848fd1f04 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -188,6 +188,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs3_accessargs arg = { .fh = NFS_FH(inode), + .access = entry->mask, }; struct nfs3_accessres res; struct rpc_message msg = { @@ -196,25 +197,9 @@ static int nfs3_proc_access(struct inode *inode, struct 
nfs_access_entry *entry) .rpc_resp = &res, .rpc_cred = entry->cred, }; - int mode = entry->mask; int status = -ENOMEM; dprintk("NFS call access\n"); - - if (mode & MAY_READ) - arg.access |= NFS3_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_EXECUTE; - } - res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) goto out; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index dcfcf7fd7438..b374f680830c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -145,7 +145,7 @@ struct nfs4_lock_state { unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; - atomic_t ls_count; + refcount_t ls_count; fl_owner_t ls_owner; }; @@ -162,6 +162,7 @@ enum { NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ + NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */ }; struct nfs4_state { @@ -185,6 +186,8 @@ struct nfs4_state { unsigned int n_rdwr; /* Number of read/write references */ fmode_t state; /* State on the server (R,W, or RW) */ atomic_t count; + + wait_queue_head_t waitq; }; @@ -458,6 +461,10 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, struct rpc_cred **); +extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state); +extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -465,7 +472,7 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); -extern int nfs4_setup_sequence(const struct nfs_client *client, +extern int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); @@ -475,6 +482,7 @@ extern int nfs4_sequence_done(struct rpc_task *task, extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); extern const nfs4_stateid zero_stateid; +extern const nfs4_stateid invalid_stateid; /* nfs4super.c */ struct nfs_mount_info; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e9bea90dc017..65a7e5da508c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -404,15 +404,19 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (error < 0) goto error; - if (!nfs4_has_session(clp)) - nfs_mark_client_ready(clp, NFS_CS_READY); - error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - if (clp != old) + if (clp != old) { clp->cl_preserve_clid = true; + /* + * Mark the client as having failed initialization so other + * processes walking the nfs_client_list in nfs_match_client() + * won't try to use it. 
+ */ + nfs_mark_client_ready(clp, -EPERM); + } nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); return old; @@ -483,7 +487,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, * ID and serverowner fields. Wait for CREATE_SESSION * to finish. */ if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(*prev); @@ -539,6 +543,9 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + status = nfs4_match_client(pos, new, &prev, nn); if (status < 0) goto out_unlock; @@ -559,7 +566,8 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ - atomic_inc(&pos->cl_count); +found: + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(prev); @@ -572,6 +580,7 @@ int nfs40_walk_client_list(struct nfs_client *new, case 0: nfs4_swap_callback_idents(pos, new); pos->cl_confirm = new->cl_confirm; + nfs_mark_client_ready(pos, NFS_CS_READY); prev = NULL; *result = pos; @@ -715,7 +724,7 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; found: - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); *result = pos; status = 0; break; @@ -749,7 +758,7 @@ nfs4_find_client_ident(struct net *net, int cb_ident) spin_lock(&nn->nfs_client_lock); clp = idr_find(&nn->cb_ident_idr, cb_ident); if (clp) - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } @@ -793,7 +802,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, spin_lock(&nn->nfs_client_lock); list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, minorversion) == false) + if (!nfs4_cb_match_client(addr, clp, minorversion)) continue; if (!nfs4_has_session(clp)) @@ -804,7 +813,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, sid->data, NFS4_MAX_SESSIONID_LEN) != 0) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f90090e8c959..56fa5a16e097 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -96,6 +96,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + struct nfs4_slot *slot, + bool is_privileged); static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, struct rpc_cred *); static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, @@ -254,15 +258,12 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE }; const u32 nfs4_fs_locations_bitmap[3] = { - FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE + FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID | FATTR4_WORD0_FILEID | FATTR4_WORD0_FS_LOCATIONS, - FATTR4_WORD1_MODE - | FATTR4_WORD1_NUMLINKS - | FATTR4_WORD1_OWNER + FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED @@ -644,13 +645,14 @@ static int nfs40_sequence_done(struct rpc_task *task, #if defined(CONFIG_NFS_V4_1) -static void 
nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +static void nfs41_release_slot(struct nfs4_slot *slot) { struct nfs4_session *session; struct nfs4_slot_table *tbl; - struct nfs4_slot *slot = res->sr_slot; bool send_new_highest_used_slotid = false; + if (!slot) + return; tbl = slot->table; session = tbl->session; @@ -676,13 +678,18 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) send_new_highest_used_slotid = false; out_unlock: spin_unlock(&tbl->slot_tbl_lock); - res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); if (waitqueue_active(&tbl->slot_waitq)) wake_up_all(&tbl->slot_waitq); } +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + nfs41_release_slot(res->sr_slot); + res->sr_slot = NULL; +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -710,13 +717,6 @@ static int nfs41_sequence_process(struct rpc_task *task, /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: - /* If previous op on slot was interrupted and we reused - * the seq# and got a reply from the cache, then retry - */ - if (task->tk_status == -EREMOTEIO && interrupted) { - ++slot->seq_nr; - goto retry_nowait; - } /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; @@ -750,16 +750,16 @@ static int nfs41_sequence_process(struct rpc_task *task, * The slot id we used was probably retired. Try again * using a different slot id. */ + if (slot->seq_nr < slot->table->target_highest_slotid) + goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: /* * Was the last operation on this sequence interrupted? * If so, retry after bumping the sequence number. */ - if (interrupted) { - ++slot->seq_nr; - goto retry_nowait; - } + if (interrupted) + goto retry_new_seq; /* * Could this slot have been previously retired? * If so, then the server may be expecting seq_nr = 1! @@ -768,10 +768,11 @@ static int nfs41_sequence_process(struct rpc_task *task, slot->seq_nr = 1; goto retry_nowait; } - break; + goto session_recover; case -NFS4ERR_SEQ_FALSE_RETRY: - ++slot->seq_nr; - goto retry_nowait; + if (interrupted) + goto retry_new_seq; + goto session_recover; default: /* Just update the slot sequence no. 
*/ slot->seq_done = 1; @@ -781,6 +782,11 @@ out: dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); out_noaction: return ret; +session_recover: + nfs4_schedule_session_recovery(session, res->sr_status); + goto retry_nowait; +retry_new_seq: + ++slot->seq_nr; retry_nowait: if (rpc_restart_call_prepare(task)) { nfs41_sequence_free_slot(res); @@ -857,6 +863,17 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; +static void +nfs4_sequence_process_interrupted(struct nfs_client *client, + struct nfs4_slot *slot, struct rpc_cred *cred) +{ + struct rpc_task *task; + + task = _nfs41_proc_sequence(client, cred, slot, true); + if (!IS_ERR(task)) + rpc_put_task_async(task); +} + #else /* !CONFIG_NFS_V4_1 */ static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -877,9 +894,34 @@ int nfs4_sequence_done(struct rpc_task *task, } EXPORT_SYMBOL_GPL(nfs4_sequence_done); +static void +nfs4_sequence_process_interrupted(struct nfs_client *client, + struct nfs4_slot *slot, struct rpc_cred *cred) +{ + WARN_ON_ONCE(1); + slot->interrupted = 0; +} + #endif /* !CONFIG_NFS_V4_1 */ -int nfs4_setup_sequence(const struct nfs_client *client, +static +void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct nfs4_slot *slot) +{ + if (!slot) + return; + slot->privileged = args->sa_privileged ? 1 : 0; + args->sa_slot = slot; + + res->sr_slot = slot; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; + +} + +int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) @@ -897,29 +939,28 @@ int nfs4_setup_sequence(const struct nfs_client *client, task->tk_timeout = 0; } - spin_lock(&tbl->slot_tbl_lock); - /* The state manager will wait until the slot table is empty */ - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; + for (;;) { + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* Try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - goto out_sleep; + if (likely(!slot->interrupted)) + break; + nfs4_sequence_process_interrupted(client, + slot, task->tk_msg.rpc_cred); } - spin_unlock(&tbl->slot_tbl_lock); - - slot->privileged = args->sa_privileged ? 
1 : 0; - args->sa_slot = slot; - res->sr_slot = slot; - if (session) { - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } + nfs4_sequence_attach_slot(args, res, slot); trace_nfs4_setup_sequence(session, args); out_start: @@ -1044,6 +1085,12 @@ struct nfs4_opendata { int rpc_status; }; +struct nfs4_open_createattrs { + struct nfs4_label *label; + struct iattr *sattr; + const __u32 verf[2]; +}; + static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, int err, struct nfs4_exception *exception) { @@ -1113,8 +1160,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, - const struct iattr *attrs, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, enum open_claim_type4 claim, gfp_t gfp_mask) { @@ -1122,6 +1168,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct inode *dir = d_inode(parent); struct nfs_server *server = NFS_SERVER(dir); struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + struct nfs4_label *label = (c != NULL) ? c->label : NULL; struct nfs4_opendata *p; p = kzalloc(sizeof(*p), gfp_mask); @@ -1187,15 +1234,11 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_DELEG_PREV_FH: p->o_arg.fh = NFS_FH(d_inode(dentry)); } - if (attrs != NULL && attrs->ia_valid != 0) { - __u32 verf[2]; - + if (c != NULL && c->sattr != NULL && c->sattr->ia_valid != 0) { p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); + memcpy(&p->attrs, c->sattr, sizeof(p->attrs)); - verf[0] = jiffies; - verf[1] = current->pid; - memcpy(p->o_arg.u.verifier.data, verf, + memcpy(p->o_arg.u.verifier.data, c->verf, sizeof(p->o_arg.u.verifier.data)); } p->c_arg.fh = &p->o_res.fh; @@ -1334,6 +1377,25 @@ static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) } #endif /* CONFIG_NFS_V4_1 */ +static void nfs_state_log_update_open_stateid(struct nfs4_state *state) +{ + if (test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) + wake_up_all(&state->waitq); +} + +static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + u32 state_seqid = be32_to_cpu(state->open_stateid.seqid); + u32 stateid_seqid = be32_to_cpu(stateid->seqid); + + if (stateid_seqid == state_seqid + 1U || + (stateid_seqid == 1U && state_seqid == 0xffffffffU)) + nfs_state_log_update_open_stateid(state); + else + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); +} + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1349,18 +1411,32 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) nfs4_state_mark_reclaim_nograce(clp, state); } +/* + * Check for whether or not the caller may update the open stateid + * to the value passed in by stateid. + * + * Note: This function relies heavily on the server implementing + * RFC7530 Section 9.1.4.2, and RFC5661 Section 8.2.2 + * correctly. + * i.e. The stateid seqids have to be initialised to 1, and + * are then incremented on every state transition. 
+ */ static bool nfs_need_update_open_stateid(struct nfs4_state *state, - const nfs4_stateid *stateid, nfs4_stateid *freeme) + const nfs4_stateid *stateid) { - if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) - return true; - if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { - nfs4_stateid_copy(freeme, &state->open_stateid); - nfs_test_and_clear_all_open_stateid(state); + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 || + !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + if (stateid->seqid == cpu_to_be32(1)) + nfs_state_log_update_open_stateid(state); + else + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); return true; } - if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + nfs_state_log_out_of_order_open_stateid(state, stateid); return true; + } return false; } @@ -1399,11 +1475,14 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, if (nfs4_stateid_match_other(stateid, &state->open_stateid) && !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { nfs_resync_open_stateid_locked(state); - return; + goto out; } if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, 0); +out: + nfs_state_log_update_open_stateid(state); } static void nfs_clear_open_stateid(struct nfs4_state *state, @@ -1420,29 +1499,60 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, } static void nfs_set_open_stateid_locked(struct nfs4_state *state, - const nfs4_stateid *stateid, fmode_t fmode, - nfs4_stateid *freeme) + const nfs4_stateid *stateid, nfs4_stateid *freeme) { - switch (fmode) { - case FMODE_READ: - set_bit(NFS_O_RDONLY_STATE, &state->flags); + DEFINE_WAIT(wait); + int status = 0; + for (;;) { + + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) break; - case FMODE_WRITE: - set_bit(NFS_O_WRONLY_STATE, &state->flags); + if (status) break; - case FMODE_READ|FMODE_WRITE: - set_bit(NFS_O_RDWR_STATE, &state->flags); + /* Rely on seqids for serialisation with NFSv4.0 */ + if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client)) + break; + + prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE); + /* + * Ensure we process the state changes in the same order + * in which the server processed them by delaying the + * update of the stateid until we are in sequence. 
+ */ + write_sequnlock(&state->seqlock); + spin_unlock(&state->owner->so_lock); + rcu_read_unlock(); + trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); + if (!signal_pending(current)) { + if (schedule_timeout(5*HZ) == 0) + status = -EAGAIN; + else + status = 0; + } else + status = -EINTR; + finish_wait(&state->waitq, &wait); + rcu_read_lock(); + spin_lock(&state->owner->so_lock); + write_seqlock(&state->seqlock); } - if (!nfs_need_update_open_stateid(state, stateid, freeme)) - return; + + if (test_bit(NFS_OPEN_STATE, &state->flags) && + !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); + nfs_test_and_clear_all_open_stateid(state); + } + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, status); + nfs_state_log_update_open_stateid(state); } -static void __update_open_stateid(struct nfs4_state *state, +static void nfs_state_set_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, - const nfs4_stateid *deleg_stateid, fmode_t fmode, nfs4_stateid *freeme) { @@ -1450,17 +1560,34 @@ static void __update_open_stateid(struct nfs4_state *state, * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ - spin_lock(&state->owner->so_lock); write_seqlock(&state->seqlock); - if (deleg_stateid != NULL) { - nfs4_stateid_copy(&state->stateid, deleg_stateid); - set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs_set_open_stateid_locked(state, open_stateid, freeme); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); + set_bit(NFS_OPEN_STATE, &state->flags); + write_sequnlock(&state->seqlock); +} + +static void nfs_state_set_delegation(struct nfs4_state *state, + const nfs4_stateid *deleg_stateid, + fmode_t fmode) +{ + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, deleg_stateid); + set_bit(NFS_DELEGATED_STATE, &state->flags); write_sequnlock(&state->seqlock); - update_open_stateflags(state, fmode); - spin_unlock(&state->owner->so_lock); } static int update_open_stateid(struct nfs4_state *state, @@ -1478,6 +1605,12 @@ static int update_open_stateid(struct nfs4_state *state, fmode &= (FMODE_READ|FMODE_WRITE); rcu_read_lock(); + spin_lock(&state->owner->so_lock); + if (open_stateid != NULL) { + nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme); + ret = 1; + } + deleg_cur = rcu_dereference(nfsi->delegation); if (deleg_cur == NULL) goto no_delegation; @@ -1494,18 +1627,16 @@ static int update_open_stateid(struct nfs4_state *state, goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, - fmode, &freeme); + nfs_state_set_delegation(state, &deleg_cur->stateid, fmode); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); no_delegation: + if (ret) + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); rcu_read_unlock(); - if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode, 
&freeme); - ret = 1; - } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) nfs4_schedule_state_manager(clp); if (freeme.type != 0) @@ -1761,7 +1892,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context struct nfs4_opendata *opendata; opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, - NULL, NULL, claim, GFP_NOFS); + NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -2518,7 +2649,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); spin_unlock(&state->state_lock); nfs4_put_lock_state(prev); @@ -2692,8 +2823,7 @@ out: static int _nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, - struct iattr *sattr, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, int *opened) { struct nfs4_state_owner *sp; @@ -2705,6 +2835,8 @@ static int _nfs4_do_open(struct inode *dir, struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; + struct iattr *sattr = c->sattr; + struct nfs4_label *label = c->label; struct nfs4_label *olabel = NULL; int status; @@ -2723,8 +2855,8 @@ static int _nfs4_do_open(struct inode *dir, status = -ENOMEM; if (d_really_is_positive(dentry)) claim = NFS4_OPEN_CLAIM_FH; - opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, - label, claim, GFP_KERNEL); + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, + c, claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; @@ -2805,10 +2937,18 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; + struct nfs4_open_createattrs c = { + .label = label, + .sattr = sattr, + .verf = { + [0] = (__u32)jiffies, + [1] = (__u32)current->pid, + }, + }; int status; do { - status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); + status = _nfs4_do_open(dir, ctx, flags, &c, opened); res = ctx->state; trace_nfs4_open_file(ctx, flags, status); if (status == 0) @@ -3024,18 +3164,20 @@ static void nfs4_close_done(struct rpc_task *task, void *data) calldata->arg.lr_args = NULL; calldata->res.lr_res = NULL; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&calldata->arg.lr_args->stateid, + calldata->inode)) + goto lr_restart; + /* Fallthrough */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_UNKNOWN_LAYOUTTYPE: case -NFS4ERR_WRONG_CRED: calldata->arg.lr_args = NULL; calldata->res.lr_res = NULL; - calldata->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; + goto lr_restart; } } @@ -3051,39 +3193,43 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.bitmask != NULL) { calldata->arg.bitmask = NULL; calldata->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - goto out_release; + goto out_restart; } break; + case -NFS4ERR_OLD_STATEID: + /* Did we race with OPEN? 
*/ + if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + state)) + goto out_restart; + goto out_release; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: nfs4_free_revoked_stateid(server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - case -NFS4ERR_OLD_STATEID: + /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &state->open_stateid)) { - rpc_restart_call_prepare(task); - goto out_release; - } - if (calldata->arg.fmode == 0) - break; + break; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - goto out_release; - } + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, res_stateid, calldata->arg.fmode); out_release: + task->tk_status = 0; nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); + return; +lr_restart: + calldata->res.lr_ret = 0; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out_release; } static void nfs4_close_prepare(struct rpc_task *task, void *data) @@ -3103,7 +3249,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -3121,7 +3266,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; if (!nfs4_valid_open_stateid(state) || - test_bit(NFS_OPEN_STATE, &state->flags) == 0) + !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -3215,6 +3360,8 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); + if (!nfs4_copy_open_stateid(&calldata->arg.stateid, state)) + goto out_free_calldata; /* Serialization for the sequence id */ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); @@ -3889,6 +4036,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry struct nfs4_accessargs args = { .fh = NFS_FH(inode), .bitmask = server->cache_consistency_bitmask, + .access = entry->mask, }; struct nfs4_accessres res = { .server = server, @@ -3899,26 +4047,8 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_resp = &res, .rpc_cred = entry->cred, }; - int mode = entry->mask; int status = 0; - /* - * Determine which access bits we want to ask for... 
- */ - if (mode & MAY_READ) - args.access |= NFS4_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_EXECUTE; - } - res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) return -ENOMEM; @@ -4843,7 +4973,7 @@ static void nfs4_renew_release(void *calldata) struct nfs4_renewdata *data = calldata; struct nfs_client *clp = data->client; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(data); @@ -4891,7 +5021,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (renew_flags == 0) return 0; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); if (data == NULL) { @@ -5643,18 +5773,20 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) data->args.lr_args = NULL; data->res.lr_res = NULL; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&data->args.lr_args->stateid, + data->inode)) + goto lr_restart; + /* Fallthrough */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_UNKNOWN_LAYOUTTYPE: case -NFS4ERR_WRONG_CRED: data->args.lr_args = NULL; data->res.lr_res = NULL; - data->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; + goto lr_restart; } } @@ -5668,27 +5800,36 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) nfs4_free_revoked_stateid(data->res.server, data->args.stateid, task->tk_msg.rpc_cred); + /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: task->tk_status = 0; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) + goto out_restart; + task->tk_status = 0; + break; case -NFS4ERR_ACCESS: if (data->args.bitmask) { data->args.bitmask = NULL; data->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; + goto out_restart; } + /* Fallthrough */ default: if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; + goto out_restart; } } data->rpc_status = task->tk_status; + return; +lr_restart: + data->res.lr_ret = 0; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); } static void nfs4_delegreturn_release(void *calldata) @@ -5896,7 +6037,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.seqid = seqid; p->res.seqid = seqid; p->lsp = lsp; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! 
*/ p->ctx = get_nfs_open_context(ctx); p->l_ctx = nfs_get_lock_context(ctx); @@ -6112,7 +6253,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); return p; @@ -6568,6 +6709,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -6763,9 +6918,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[3] = { - [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - }; + u32 bitmask[3]; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -6784,12 +6937,15 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, dprintk("%s: start\n", __func__); + bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* Ask for the fileid of the absent filesystem if mounted_on_fileid * is not supported */ if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + bitmask[0] &= ~FATTR4_WORD0_FILEID; else - bitmask[0] |= FATTR4_WORD0_FILEID; + bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; @@ -7472,7 +7628,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, struct nfs41_exchange_id_data *calldata; int status; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return ERR_PTR(-EIO); status = -ENOMEM; @@ -8072,7 +8228,7 @@ static void nfs41_sequence_release(void *data) struct nfs4_sequence_data *calldata = data; struct nfs_client *clp = calldata->clp; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(calldata); @@ -8101,7 +8257,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) trace_nfs4_sequence(clp, task->tk_status); if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); - if (atomic_read(&clp->cl_count) == 1) + if (refcount_read(&clp->cl_count) == 1) goto out; if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { @@ -8135,6 +8291,7 @@ static const struct rpc_call_ops nfs41_sequence_ops = { static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred, + struct nfs4_slot *slot, bool is_privileged) { struct nfs4_sequence_data *calldata; @@ -8148,15 +8305,18 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .callback_ops = &nfs41_sequence_ops, .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; + struct rpc_task *ret; - if (!atomic_inc_not_zero(&clp->cl_count)) - return ERR_PTR(-EIO); + ret = ERR_PTR(-EIO); + if (!refcount_inc_not_zero(&clp->cl_count)) + goto out_err; + + ret = ERR_PTR(-ENOMEM); calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if 
(calldata == NULL) { - nfs_put_client(clp); - return ERR_PTR(-ENOMEM); - } + if (calldata == NULL) + goto out_put_clp; nfs4_init_sequence(&calldata->args, &calldata->res, 0); + nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot); if (is_privileged) nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; @@ -8164,7 +8324,15 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, calldata->clp = clp; task_setup_data.callback_data = calldata; - return rpc_run_task(&task_setup_data); + ret = rpc_run_task(&task_setup_data); + if (IS_ERR(ret)) + goto out_err; + return ret; +out_put_clp: + nfs_put_client(clp); +out_err: + nfs41_release_slot(slot); + return ret; } static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) @@ -8174,7 +8342,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return -EAGAIN; - task = _nfs41_proc_sequence(clp, cred, false); + task = _nfs41_proc_sequence(clp, cred, NULL, false); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -8188,7 +8356,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred, true); + task = _nfs41_proc_sequence(clp, cred, NULL, true); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; @@ -8588,18 +8756,27 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&lrp->args.stateid, + lrp->args.inode)) + goto out_restart; + /* Fallthrough */ default: task->tk_status = 0; + /* Fallthrough */ case 0: break; case -NFS4ERR_DELAY: if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; - nfs4_sequence_free_slot(&lrp->res.seq_res); - rpc_restart_call_prepare(task); - return; + goto out_restart; } dprintk("<-- %s\n", __func__); + return; +out_restart: + task->tk_status = 0; + nfs4_sequence_free_slot(&lrp->res.seq_res); + rpc_restart_call_prepare(task); } static void nfs4_layoutreturn_release(void *calldata) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0378e2257ca7..e4f4a09ed9f4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -69,6 +69,14 @@ const nfs4_stateid zero_stateid = { { .data = { 0 } }, .type = NFS4_SPECIAL_STATEID_TYPE, }; +const nfs4_stateid invalid_stateid = { + { + /* Funky initialiser keeps older gcc versions happy */ + .data = { 0xff, 0xff, 0xff, 0xff, 0 }, + }, + .type = NFS4_INVALID_STATEID_TYPE, +}; + static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -645,6 +653,7 @@ nfs4_alloc_open_state(void) INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); seqlock_init(&state->seqlock); + init_waitqueue_head(&state->waitq); return state; } @@ -825,7 +834,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, ret = pos; } if (ret) - atomic_inc(&ret->ls_count); + refcount_inc(&ret->ls_count); return ret; } @@ -843,7 +852,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f if (lsp == NULL) return NULL; nfs4_init_seqid_counter(&lsp->ls_seqid); - atomic_set(&lsp->ls_count, 1); + refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); @@ -907,7 +916,7 @@ void 
nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (lsp == NULL) return; state = lsp->ls_state; - if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + if (!refcount_dec_and_lock(&lsp->ls_count, &state->state_lock)) return; list_del(&lsp->ls_locks); if (list_empty(&state->lock_states)) @@ -927,7 +936,7 @@ static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; dst->fl_u.nfs4_fl.owner = lsp; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); } static void nfs4_fl_release_lock(struct file_lock *fl) @@ -985,18 +994,39 @@ out: return ret; } -static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ + bool ret; + int seq; + + do { + ret = false; + seq = read_seqbegin(&state->seqlock); + if (nfs4_state_match_open_stateid_other(state, dst)) { + dst->seqid = state->open_stateid.seqid; + ret = true; + } + } while (read_seqretry(&state->seqlock, seq)); + return ret; +} + +bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { + bool ret; const nfs4_stateid *src; int seq; do { + ret = false; src = &zero_stateid; seq = read_seqbegin(&state->seqlock); - if (test_bit(NFS_OPEN_STATE, &state->flags)) + if (test_bit(NFS_OPEN_STATE, &state->flags)) { src = &state->open_stateid; + ret = true; + } nfs4_stateid_copy(dst, src); } while (read_seqretry(&state->seqlock, seq)); + return ret; } /* @@ -1177,7 +1207,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); /* The rcu_read_lock() is not strictly necessary, as the state * manager is the only thread that ever changes the rpc_xprt @@ -1269,7 +1299,7 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); if (res) @@ -1409,6 +1439,11 @@ void nfs_inode_find_state_and_recover(struct inode *inode, found = true; continue; } + if (nfs4_stateid_match_other(&state->open_stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; + continue; + } if (nfs_state_lock_state_matches_stateid(state, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) found = true; @@ -2510,7 +2545,7 @@ static void nfs4_state_manager(struct nfs_client *clp) break; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) break; - } while (atomic_read(&clp->cl_count) > 1); + } while (refcount_read(&clp->cl_count) > 1); return; out_error: if (strlen(section)) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index e7c6275519b0..a275fba93170 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -202,17 +202,13 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_ARGS(clp, error), TP_STRUCT__entry( - __string(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)) + __string(dstaddr, clp->cl_hostname) __field(int, error) ), TP_fast_assign( __entry->error = error; - __assign_str(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); + __assign_str(dstaddr, clp->cl_hostname); ), TP_printk( @@ -1066,6 +1062,8 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); 
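[Editor's note: the dominant pattern in the hunks above and below is the conversion of reference counters from atomic_t to refcount_t (cl_count, ls_count, plh_refcount, pls_refcount, ds_count). A minimal sketch of that conversion follows, using an illustrative struct rather than any of the NFS objects; unlike atomic_t, refcount_t saturates instead of wrapping on overflow, turning a potential use-after-free into a WARN.]

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct obj {
		refcount_t ref;
	};

	static struct obj *obj_alloc(void)
	{
		struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

		if (o)
			refcount_set(&o->ref, 1);	/* was: atomic_set() */
		return o;
	}

	static struct obj *obj_get(struct obj *o)
	{
		refcount_inc(&o->ref);			/* was: atomic_inc() */
		return o;
	}

	/* Lookup paths use refcount_inc_not_zero(), as in the hunks above,
	 * so an object whose count has already hit zero is never revived. */

	static void obj_put(struct obj *o)
	{
		if (refcount_dec_and_test(&o->ref))	/* was: atomic_dec_and_test() */
			kfree(o);
	}
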
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -1133,9 +1131,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_fast_assign( @@ -1148,9 +1144,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_printk( @@ -1192,9 +1186,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1209,9 +1201,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 14ed9791ec9c..77c6729e57f0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4385,6 +4385,14 @@ static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *state return decode_stateid(xdr, stateid); } +static int decode_invalid_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + nfs4_stateid dummy; + + nfs4_stateid_copy(stateid, &invalid_stateid); + return decode_stateid(xdr, &dummy); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4393,7 +4401,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_open_stateid(xdr, &res->stateid); + status = decode_invalid_stateid(xdr, &res->stateid); return status; } @@ -6108,6 +6116,8 @@ static int decode_layoutreturn(struct xdr_stream *xdr, res->lrs_present = be32_to_cpup(p); if (res->lrs_present) status = decode_layout_stateid(xdr, &res->stateid); + else + nfs4_stateid_copy(&res->stateid, &invalid_stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3bcd669a3152..d602fe9e1ac8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -251,7 +251,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) { - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static struct pnfs_layout_hdr * @@ -296,7 +296,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) pnfs_layoutreturn_before_put_layout_hdr(lo); - if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); @@ -355,6 +355,24 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* + * Update the seqid of a layout stateid + */ +bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode) +{ 
+ struct pnfs_layout_hdr *lo; + bool ret = false; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + dst->seqid = lo->plh_stateid.seqid; + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* * Mark a pnfs_layout_hdr and all associated layout segments as invalid * * In order to continue using the pnfs_layout_hdr, a full recovery @@ -395,14 +413,14 @@ pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; if (!test_and_set_bit(fail_bit, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static void pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { if (test_and_clear_bit(fail_bit, &lo->plh_flags)) - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); } static void @@ -450,7 +468,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); - atomic_set(&lseg->pls_refcount, 1); + refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; lseg->pls_range = *range; @@ -472,7 +490,7 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return; if (list_empty(&lo->plh_segs) && @@ -507,13 +525,13 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) return; dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount), + refcount_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); lo = lseg->pls_layout; inode = lo->plh_inode; - if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { spin_unlock(&inode->i_lock); return; @@ -551,7 +569,7 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { - if (!atomic_dec_and_test(&lseg->pls_refcount)) + if (!refcount_dec_and_test(&lseg->pls_refcount)) return false; pnfs_layout_remove_lseg(lseg->pls_layout, lseg); list_add(&lseg->pls_list, tmp_list); @@ -570,7 +588,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * outstanding io is finished. */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount)); + refcount_read(&lseg->pls_refcount)); if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; } @@ -1451,7 +1469,7 @@ alloc_init_layout_hdr(struct inode *ino, lo = pnfs_alloc_layout_hdr(ino, gfp_flags); if (!lo) return NULL; - atomic_set(&lo->plh_refcount, 1); + refcount_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_return_segs); @@ -1513,7 +1531,7 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || (range->iomode != ls_range->iomode && - strict_iomode == true) || + strict_iomode) || !pnfs_lseg_range_intersecting(ls_range, range)) return 0; @@ -1546,7 +1564,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, } dprintk("%s:Return lseg %p ref %d\n", - __func__, ret, ret ? 
atomic_read(&ret->pls_refcount) : 0); + __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0); return ret; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 87f144f14d1e..8d507c361d98 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,7 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include <linux/refcount.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/workqueue.h> @@ -54,7 +55,7 @@ struct nfs4_pnfs_ds { char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; struct nfs_client *ds_clp; - atomic_t ds_count; + refcount_t ds_count; unsigned long ds_state; #define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; @@ -63,7 +64,7 @@ struct pnfs_layout_segment { struct list_head pls_list; struct list_head pls_lc_list; struct pnfs_layout_range pls_range; - atomic_t pls_refcount; + refcount_t pls_refcount; u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; @@ -179,7 +180,7 @@ struct pnfs_layoutdriver_type { }; struct pnfs_layout_hdr { - atomic_t plh_refcount; + refcount_t plh_refcount; atomic_t plh_outstanding; /* number of RPCs out */ struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; @@ -251,6 +252,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); +bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -393,7 +395,7 @@ static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { - atomic_inc(&lseg->pls_refcount); + refcount_inc(&lseg->pls_refcount); smp_mb__after_atomic(); } return lseg; @@ -764,6 +766,11 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } +static inline bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, + struct inode *inode) +{ + return false; +} #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 60da59be83b6..03aaa60c7768 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -338,7 +338,7 @@ print_ds(struct nfs4_pnfs_ds *ds) " client %p\n" " cl_exchange_flags %x\n", ds->ds_remotestr, - atomic_read(&ds->ds_count), ds->ds_clp, + refcount_read(&ds->ds_count), ds->ds_clp, ds->ds_clp ? 
ds->ds_clp->cl_exchange_flags : 0); } @@ -451,7 +451,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (atomic_dec_and_lock(&ds->ds_count, + if (refcount_dec_and_lock(&ds->ds_count, &nfs4_ds_cache_lock)) { list_del_init(&ds->ds_node); spin_unlock(&nfs4_ds_cache_lock); @@ -537,7 +537,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; - atomic_set(&ds->ds_count, 1); + refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); @@ -546,10 +546,10 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) } else { kfree(remotestr); kfree(ds); - atomic_inc(&tmp_ds->ds_count); + refcount_inc(&tmp_ds->ds_count); dprintk("%s data server %s found, inc'ed ds_count to %d\n", __func__, tmp_ds->ds_remotestr, - atomic_read(&tmp_ds->ds_count)); + refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } spin_unlock(&nfs4_ds_cache_lock); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c9d24bae3025..29bacdc56f6a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -813,9 +813,9 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) */ seq_printf(m, "\n\topts:\t"); seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw"); - seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); @@ -1332,7 +1332,7 @@ static int nfs_parse_mount_options(char *raw, mnt->options |= NFS_OPTION_MIGRATION; break; case Opt_nomigration: - mnt->options &= NFS_OPTION_MIGRATION; + mnt->options &= ~NFS_OPTION_MIGRATION; break; /* @@ -1456,18 +1456,21 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; @@ -1494,11 +1497,13 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp6: mountfamily = AF_INET6; + /* fall through */ case Opt_xprt_udp: mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: mountfamily = AF_INET6; + /* fall through */ case Opt_xprt_tcp: mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -1988,9 +1993,9 @@ static int nfs23_validate_mount_data(void *options, args->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: - data->namlen = 0; + data->namlen = 0; /* fall through */ case 2: - data->bsize = 0; + data->bsize = 0; /* fall through */ case 3: if (data->flags & NFS_MOUNT_VER3) goto out_no_v3; @@ -1998,11 +2003,14 @@ static int nfs23_validate_mount_data(void *options, memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); /* Turn off security negotiation */ 
extra_flags |= NFS_MOUNT_SECFLAVOUR; + /* fall through */ case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; + /* fall through */ case 5: memset(data->context, 0, sizeof(data->context)); + /* fall through */ case 6: if (data->flags & NFS_MOUNT_VER3) { if (data->root.size > NFS3_FHSIZE || data->root.size == 0) @@ -2288,11 +2296,11 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) /* * noac is a special case. It implies -o sync, but that's not * necessarily reflected in the mtab options. do_remount_sb - * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the * remount options, so we have to explicitly reset it. */ if (data->flags & NFS_MOUNT_NOAC) - *flags |= MS_SYNCHRONOUS; + *flags |= SB_SYNCHRONOUS; /* compare new mount options with old ones */ error = nfs_compare_remount_data(nfss, data); @@ -2341,7 +2349,7 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; sb->s_export_op = &nfs_export_ops; } @@ -2371,7 +2379,7 @@ static void nfs_clone_super(struct super_block *sb, /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; } nfs_initialise_sb(sb); @@ -2592,11 +2600,11 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + sb_mntdata.mntflags |= SB_SYNCHRONOUS; if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) - if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (mount_info->cloned->sb->s_flags & SB_SYNCHRONOUS) + sb_mntdata.mntflags |= SB_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); @@ -2633,7 +2641,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, if (error) goto error_splat_root; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; out: return mntroot; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index babebbccae2a..4a379d7918f2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -487,10 +487,8 @@ try_again: } ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); - } + if (ret < 0) + goto release_request; /* lock each request in the page group */ total_bytes = head->wb_bytes; @@ -515,8 +513,7 @@ try_again: if (ret < 0) { nfs_unroll_locks(inode, head, subreq); nfs_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + goto release_request; } } /* @@ -532,8 +529,8 @@ try_again: nfs_page_group_unlock(head); nfs_unroll_locks(inode, head, subreq); nfs_unlock_and_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(-EIO); + ret = -EIO; + goto release_request; } } @@ -576,6 +573,10 @@ try_again: /* still holds ref on head from nfs_page_find_head_request * and still has lock on head from lock loop */ return head; + +release_request: + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); } static void nfs_write_error_remove_page(struct nfs_page *req) @@ -1889,6 +1890,8 @@ int nfs_commit_inode(struct inode *inode, int how) if (res) error = 
nfs_generic_commit_list(inode, &head, how, &cinfo); nfs_commit_end(cinfo.mds); + if (res == 0) + return res; if (error < 0) goto out_error; if (!may_wait) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 420d3a0ab258..5be08f02a76b 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -30,7 +30,11 @@ locks_start_grace(struct net *net, struct lock_manager *lm) struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); - list_add(&lm->list, grace_list); + if (list_empty(&lm->list)) + list_add(&lm->list, grace_list); + else + WARN(1, "double list_add attempt detected in net %x %s\n", + net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); @@ -55,14 +59,7 @@ locks_end_grace(struct lock_manager *lm) } EXPORT_SYMBOL_GPL(locks_end_grace); -/** - * locks_in_grace - * - * Lock managers call this function to determine when it is OK for them - * to answer ordinary lock requests, and when they should accept only - * lock reclaims. - */ -int +static bool __state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); @@ -78,15 +75,22 @@ __state_in_grace(struct net *net, bool open) return false; } -int locks_in_grace(struct net *net) +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +bool locks_in_grace(struct net *net) { - return __state_in_grace(net, 0); + return __state_in_grace(net, false); } EXPORT_SYMBOL_GPL(locks_in_grace); -int opens_in_grace(struct net *net) +bool opens_in_grace(struct net *net) { - return __state_in_grace(net, 1); + return __state_in_grace(net, true); } EXPORT_SYMBOL_GPL(opens_in_grace); @@ -104,7 +108,9 @@ grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); - BUG_ON(!list_empty(grace_list)); + WARN_ONCE(!list_empty(grace_list), + "net %x %s: grace_list is not empty\n", + net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..f650e475d8f0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 46b48dbbdd32..8ceb25a10ea0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -232,7 +232,7 @@ static struct cache_head *expkey_alloc(void) return NULL; } -static struct cache_detail svc_expkey_cache_template = { +static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", @@ -748,7 +748,7 @@ static struct cache_head *svc_export_alloc(void) return NULL; } -static struct cache_detail svc_export_cache_template = { +static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", @@ -1230,7 +1230,7 @@ nfsd_export_init(struct net *net) int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: initializing export module (net: %p).\n", net); + dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if 
(IS_ERR(nn->svc_export_cache)) @@ -1278,7 +1278,7 @@ nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: shutting down export module (net: %p).\n", net); + dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); @@ -1286,5 +1286,5 @@ nfsd_export_shutdown(struct net *net) cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); - dprintk("nfsd: export shutdown complete (net: %p).\n", net); + dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); } diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 6dfede6d172a..84831253203d 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -12,6 +12,7 @@ #include <linux/nsproxy.h> #include <linux/sunrpc/addr.h> #include <linux/uaccess.h> +#include <linux/kernel.h> #include "state.h" #include "netns.h" @@ -126,8 +127,6 @@ static struct nfsd_fault_inject_op inject_ops[] = { }, }; -#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op)) - int nfsd_fault_inject_init(void) { unsigned int i; @@ -138,7 +137,7 @@ int nfsd_fault_inject_init(void) if (!debug_dir) goto fail; - for (i = 0; i < NUM_INJECT_OPS; i++) { + for (i = 0; i < ARRAY_SIZE(inject_ops); i++) { op = &inject_ops[i]; if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) goto fail; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 3714231a9d0f..36358d435cb0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -107,7 +107,7 @@ struct nfsd_net { bool lockd_up; /* Time of server startup */ - struct timeval nfssvc_boot; + struct timespec64 nfssvc_boot; /* * Max number of connections this nfsd container will allow. 
Defaults @@ -119,6 +119,9 @@ struct nfsd_net { u32 clverifier_counter; struct svc_serv *nfsd_serv; + + wait_queue_head_t ntf_wq; + atomic_t ntf_refcnt; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f38acd905441..2758480555fa 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -748,8 +748,9 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); - *p++ = htonl(nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_usec); + /* unique identifier, y2038 overflow can be ignored */ + *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_nsec); } return xdr_ressize_check(rqstp, p); } @@ -1119,8 +1120,9 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { - *p++ = htonl(nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_usec); + /* unique identifier, y2038 overflow can be ignored */ + *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_nsec); } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 6b9b6cca469f..a5bb76593ce7 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -178,7 +178,7 @@ static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); static struct ent *idtoname_update(struct cache_detail *, struct ent *, struct ent *); -static struct cache_detail idtoname_cache_template = { +static const struct cache_detail idtoname_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", @@ -341,7 +341,7 @@ static struct ent *nametoid_update(struct cache_detail *, struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); -static struct cache_detail nametoid_cache_template = { +static const struct cache_detail nametoid_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ea45d954e8d7..7d888369f85a 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -336,7 +336,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) trace_layout_recall(&ls->ls_stid.sc_stateid); - atomic_inc(&ls->ls_stid.sc_count); + refcount_inc(&ls->ls_stid.sc_count); nfsd4_run_cb(&ls->ls_recall); out_unlock: @@ -441,7 +441,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) goto done; } - atomic_inc(&ls->ls_stid.sc_count); + refcount_inc(&ls->ls_stid.sc_count); list_add_tail(&new->lo_perstate, &ls->ls_layouts); new = NULL; done: diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8487486ec496..008ea0b627d0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -485,9 +485,6 @@ static __be32 nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - if (!cstate->current_fh.fh_dentry) - return nfserr_nofilehandle; - u->getfh = &cstate->current_fh; return nfs_ok; } @@ -535,9 +532,6 @@ static __be32 nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - if (!cstate->current_fh.fh_dentry) - return nfserr_nofilehandle; - fh_dup2(&cstate->save_fh, &cstate->current_fh); if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) { memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t)); @@ -570,10 +564,11 @@ static 
void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) /* * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy + * __force to keep sparse happy. y2038 time_t overflow is + * irrelevant in this usage. */ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; memcpy(verifier->data, verf, sizeof(verifier->data)); } @@ -703,10 +698,8 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_link *link = &u->link; - __be32 status = nfserr_nofilehandle; + __be32 status; - if (!cstate->save_fh.fh_dentry) - return status; status = nfsd_link(rqstp, &cstate->current_fh, link->li_name, link->li_namelen, &cstate->save_fh); if (!status) @@ -850,10 +843,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_rename *rename = &u->rename; - __be32 status = nfserr_nofilehandle; + __be32 status; - if (!cstate->save_fh.fh_dentry) - return status; if (opens_in_grace(SVC_NET(rqstp)) && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0c04f81aa63b..b29b5a185a2c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,12 +63,16 @@ static const stateid_t zero_stateid = { static const stateid_t currentstateid = { .si_generation = 1, }; +static const stateid_t close_stateid = { + .si_generation = 0xffffffffU, +}; static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) +#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); @@ -83,6 +87,11 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid); */ static DEFINE_SPINLOCK(state_lock); +enum nfsd4_st_mutex_lock_subclass { + OPEN_STATEID_MUTEX = 0, + LOCK_STATEID_MUTEX = 1, +}; + /* * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for * the refcount on the open stateid to drop. 
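The nfs4state.c hunks below continue the conversion of nfsd's stateid and file reference counts (sc_count, fi_ref, co_odcount) from atomic_t to refcount_t. As a minimal sketch of the lifecycle pattern these call sites follow (using a hypothetical cached_obj type, not any structure from this patch), the alloc/get/put sequence looks roughly like this:

#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/slab.h>

struct cached_obj {
	refcount_t ref;
	struct list_head node;
};

static LIST_HEAD(obj_cache);
static DEFINE_SPINLOCK(obj_cache_lock);

static struct cached_obj *obj_alloc(void)
{
	struct cached_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		refcount_set(&o->ref, 1);	/* initial reference */
	return o;
}

static bool obj_get(struct cached_obj *o)
{
	/* Fails once the count has hit zero, so a lookup cannot
	 * resurrect an object that is already being freed. */
	return refcount_inc_not_zero(&o->ref);
}

static void obj_put(struct cached_obj *o)
{
	/* Take the cache lock only on the final put, so the unlink and
	 * the free are atomic with respect to concurrent lookups. */
	if (refcount_dec_and_lock(&o->ref, &obj_cache_lock)) {
		list_del(&o->node);
		spin_unlock(&obj_cache_lock);
		kfree(o);
	}
}

Unlike atomic_t, refcount_t saturates and warns on overflow or underflow instead of silently wrapping, which is why the conversions below are mechanical: refcount_set() replaces atomic_set(..., 1), refcount_inc() replaces atomic_inc(), and refcount_dec_and_lock() replaces atomic_dec_and_lock().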
@@ -359,7 +368,7 @@ put_nfs4_file(struct nfs4_file *fi) { might_lock(&state_lock); - if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { + if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); @@ -568,7 +577,7 @@ alloc_clnt_odstate(struct nfs4_client *clp) co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); if (co) { co->co_client = clp; - atomic_set(&co->co_odcount, 1); + refcount_set(&co->co_odcount, 1); } return co; } @@ -586,7 +595,7 @@ static inline void get_clnt_odstate(struct nfs4_clnt_odstate *co) { if (co) - atomic_inc(&co->co_odcount); + refcount_inc(&co->co_odcount); } static void @@ -598,7 +607,7 @@ put_clnt_odstate(struct nfs4_clnt_odstate *co) return; fp = co->co_file; - if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + if (refcount_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { list_del(&co->co_perfile); spin_unlock(&fp->fi_lock); @@ -656,7 +665,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ - atomic_set(&stid->sc_count, 1); + refcount_set(&stid->sc_count, 1); spin_lock_init(&stid->sc_lock); /* @@ -813,7 +822,7 @@ nfs4_put_stid(struct nfs4_stid *s) might_lock(&clp->cl_lock); - if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) { + if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) { wake_up_all(&close_wq); return; } @@ -913,7 +922,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) if (status) return status; ++fp->fi_delegees; - atomic_inc(&dp->dl_stid.sc_count); + refcount_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); @@ -1214,7 +1223,7 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, WARN_ON_ONCE(!list_empty(&stp->st_locks)); - if (!atomic_dec_and_test(&s->sc_count)) { + if (!refcount_dec_and_test(&s->sc_count)) { wake_up_all(&close_wq); return; } @@ -1439,8 +1448,10 @@ free_session_slots(struct nfsd4_session *ses) { int i; - for (i = 0; i < ses->se_fchannel.maxreqs; i++) + for (i = 0; i < ses->se_fchannel.maxreqs; i++) { + free_svc_cred(&ses->se_slots[i]->sl_cred); kfree(ses->se_slots[i]); + } } /* @@ -1472,6 +1483,11 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) spin_lock(&nfsd_drc_lock); avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, nfsd_drc_max_mem - nfsd_drc_mem_used); + /* + * Never use more than a third of the remaining memory, + * unless it's the only way to give this client a slot: + */ + avail = clamp_t(int, avail, slotsize, avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -2072,7 +2088,7 @@ find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) s = find_stateid_locked(cl, t); if (s != NULL) { if (typemask & s->sc_type) - atomic_inc(&s->sc_count); + refcount_inc(&s->sc_count); else s = NULL; } @@ -2287,14 +2303,18 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) dprintk("--> %s slot %p\n", __func__, slot); + slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; slot->sl_status = resp->cstate.status; + free_svc_cred(&slot->sl_cred); + copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); - slot->sl_flags |= NFSD4_SLOT_INITIALIZED; - if (nfsd4_not_cached(resp)) { - 
slot->sl_datalen = 0; + if (!nfsd4_cache_this(resp)) { + slot->sl_flags &= ~NFSD4_SLOT_CACHED; return; } + slot->sl_flags |= NFSD4_SLOT_CACHED; + base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) @@ -2321,8 +2341,16 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, op = &args->ops[resp->opcnt - 1]; nfsd4_encode_operation(resp, op); - /* Return nfserr_retry_uncached_rep in next operation. */ - if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) { + if (slot->sl_flags & NFSD4_SLOT_CACHED) + return op->status; + if (args->opcnt == 1) { + /* + * The original operation wasn't a solo sequence--we + * always cache those--so this retry must not match the + * original: + */ + op->status = nfserr_seq_false_retry; + } else { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } @@ -2986,6 +3014,34 @@ static bool nfsd4_request_too_big(struct svc_rqst *rqstp, return xb->len > session->se_fchannel.maxreq_sz; } +static bool replay_matches_cache(struct svc_rqst *rqstp, + struct nfsd4_sequence *seq, struct nfsd4_slot *slot) +{ + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + if ((bool)(slot->sl_flags & NFSD4_SLOT_CACHETHIS) != + (bool)seq->cachethis) + return false; + /* + * If there's an error then the reply can have fewer ops than + * the call. But if we cached a reply with *more* ops than the + * call you're sending us now, then this new call is clearly not + * really a replay of the old one: + */ + if (slot->sl_opcnt < argp->opcnt) + return false; + /* This is the only check explicitly called by spec: */ + if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) + return false; + /* + * There may be more comparisons we could actually do, but the + * spec doesn't require us to catch every case where the calls + * don't match (that would require caching the call as well as + * the reply), so we don't bother. 
+ */ + return true; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -3045,6 +3101,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) goto out_put_session; + status = nfserr_seq_false_retry; + if (!replay_matches_cache(rqstp, seq, slot)) + goto out_put_session; cstate->slot = slot; cstate->session = session; cstate->clp = clp; @@ -3351,7 +3410,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, { lockdep_assert_held(&state_lock); - atomic_set(&fp->fi_ref, 1); + refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); @@ -3512,15 +3571,63 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; - if (local->st_stateowner == &oo->oo_owner) { + if (local->st_stateowner != &oo->oo_owner) + continue; + if (local->st_stid.sc_type == NFS4_OPEN_STID) { ret = local; - atomic_inc(&ret->st_stid.sc_count); + refcount_inc(&ret->st_stid.sc_count); break; } } return ret; } +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + switch (s->sc_type) { + default: + break; + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + ret = nfserr_bad_stateid; + break; + case NFS4_REVOKED_DELEG_STID: + ret = nfserr_deleg_revoked; + } + return ret; +} + +/* Lock the stateid st_mutex, and deal with races with CLOSE */ +static __be32 +nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) +{ + __be32 ret; + + mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); + ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret != nfs_ok) + mutex_unlock(&stp->st_mutex); + return ret; +} + +static struct nfs4_ol_stateid * +nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *stp; + for (;;) { + spin_lock(&fp->fi_lock); + stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); + if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) + break; + nfs4_put_stid(&stp->st_stid); + } + return stp; +} + static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) @@ -3563,8 +3670,9 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) stp = open->op_stp; /* We are moving these outside of the spinlocks to avoid the warnings */ mutex_init(&stp->st_mutex); - mutex_lock(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3573,7 +3681,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) goto out_unlock; open->op_stp = NULL; - atomic_inc(&stp->st_stid.sc_count); + refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); @@ -3589,7 +3697,11 @@ out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); if (retstp) { - mutex_lock(&retstp->st_mutex); + /* Handle races with CLOSE */ + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); stp = retstp; @@ -3621,7 +3733,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net 
*net) * there should be no danger of the refcount going back up again at * this point. */ - wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2); + wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); if (s->st_stid.sc_file) { @@ -3647,7 +3759,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { if (fh_match(&fp->fi_fhandle, fh)) { - if (atomic_inc_not_zero(&fp->fi_ref)) + if (refcount_inc_not_zero(&fp->fi_ref)) return fp; } } @@ -3783,7 +3895,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * lock) we know the server hasn't removed the lease yet, we know * it's safe to take a reference. */ - atomic_inc(&dp->dl_stid.sc_count); + refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&dp->dl_recall); } @@ -3966,7 +4078,8 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + ret = find_stateid_by_type(cl, s, + NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); if (!ret) return NULL; return delegstateid(ret); @@ -3989,6 +4102,12 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; + if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + nfs4_put_stid(&deleg->dl_stid); + if (cl->cl_minorversion) + status = nfserr_deleg_revoked; + goto out; + } flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(deleg, flags); if (status) { @@ -4392,6 +4511,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; + bool new_stp = false; /* * Lookup file; if found, lookup stateid and check open request, @@ -4403,9 +4523,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; - spin_lock(&fp->fi_lock); - stp = nfsd4_find_existing_open(fp, open); - spin_unlock(&fp->fi_lock); + stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; status = nfserr_bad_stateid; @@ -4413,35 +4531,31 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } + if (!stp) { + stp = init_open_stateid(fp, open); + if (!open->op_stp) + new_stp = true; + } + /* * OPEN the file, or upgrade an existing OPEN. * If truncate fails, the OPEN fails. + * + * stp is already locked. */ - if (stp) { + if (!new_stp) { /* Stateid was found, this is an OPEN upgrade */ - mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { mutex_unlock(&stp->st_mutex); goto out; } } else { - /* stp is returned locked. 
*/ - stp = init_open_stateid(fp, open); - /* See if we lost the race to some other thread */ - if (stp->st_access_bmap != 0) { - status = nfs4_upgrade_open(rqstp, fp, current_fh, - stp, open); - if (status) { - mutex_unlock(&stp->st_mutex); - goto out; - } - goto upgrade_out; - } status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - mutex_unlock(&stp->st_mutex); + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); + mutex_unlock(&stp->st_mutex); goto out; } @@ -4450,7 +4564,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } -upgrade_out: + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); @@ -4677,7 +4791,7 @@ nfs4_laundromat(struct nfsd_net *nn) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); @@ -4798,6 +4912,18 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session) +{ + __be32 ret; + + spin_lock(&s->sc_lock); + ret = nfsd4_verify_open_stid(s); + if (ret == nfs_ok) + ret = check_stateid_generation(in, &s->sc_stateid, has_session); + spin_unlock(&s->sc_lock); + return ret; +} + static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) { if (ols->st_stateowner->so_is_open_owner && @@ -4811,7 +4937,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) struct nfs4_stid *s; __be32 status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return status; /* Client debugging aid. */ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { @@ -4826,7 +4953,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; - status = check_stateid_generation(stateid, &s->sc_stateid, 1); + status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; switch (s->sc_type) { @@ -4858,8 +4985,19 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + bool return_revoked = false; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + /* + * only return revoked delegations if explicitly asked. + * otherwise we report revoked or bad_stateid status. 
+ */ + if (typemask & NFS4_REVOKED_DELEG_STID) + return_revoked = true; + else if (typemask & NFS4_DELEG_STID) + typemask |= NFS4_REVOKED_DELEG_STID; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { @@ -4872,6 +5010,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, *s = find_stateid_by_type(cstate->clp, stateid, typemask); if (!*s) return nfserr_bad_stateid; + if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(*s); + if (cstate->minorversion) + return nfserr_deleg_revoked; + return nfserr_bad_stateid; + } return nfs_ok; } @@ -4971,7 +5115,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, &s, nn); if (status) return status; - status = check_stateid_generation(stateid, &s->sc_stateid, + status = nfsd4_stid_check_stateid_generation(stateid, s, nfsd4_has_session(cstate)); if (status) goto out; @@ -5025,7 +5169,9 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret; - mutex_lock(&stp->st_mutex); + ret = nfsd4_lock_ol_stateid(stp); + if (ret) + goto out_put_stid; ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) @@ -5036,11 +5182,13 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) lockowner(stp->st_stateowner))) goto out; + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(stp); ret = nfs_ok; out: mutex_unlock(&stp->st_mutex); +out_put_stid: nfs4_put_stid(s); return ret; } @@ -5060,6 +5208,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; + spin_lock(&s->sc_lock); switch (s->sc_type) { case NFS4_DELEG_STID: ret = nfserr_locks_held; @@ -5071,11 +5220,13 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: - atomic_inc(&s->sc_count); + spin_unlock(&s->sc_lock); + refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: + spin_unlock(&s->sc_lock); dp = delegstateid(s); list_del_init(&dp->dl_recall_lru); spin_unlock(&cl->cl_lock); @@ -5084,6 +5235,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* Default falls through and returns nfserr_bad_stateid */ } + spin_unlock(&s->sc_lock); out_unlock: spin_unlock(&cl->cl_lock); out: @@ -5106,15 +5258,9 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID - || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) - /* - * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step, and - * revoked delegations are kept only for free_stateid. 
- */ - return nfserr_bad_stateid; - mutex_lock(&stp->st_mutex); + status = nfsd4_lock_ol_stateid(stp); + if (status != nfs_ok) + return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); @@ -5294,7 +5440,6 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) bool unhashed; LIST_HEAD(reaplist); - s->st_stid.sc_type = NFS4_CLOSED_STID; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -5334,10 +5479,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; + + stp->st_stid.sc_type = NFS4_CLOSED_STID; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); + mutex_unlock(&stp->st_mutex); + + /* See RFC5661 section 18.2.4 */ + if (stp->st_stid.sc_client->cl_minorversion) + memcpy(&close->cl_stateid, &close_stateid, + sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); @@ -5363,7 +5515,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; dp = delegstateid(s); - status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); + status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate)); if (status) goto put_stateid; @@ -5569,16 +5721,43 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, return ret; } -static void +static struct nfs4_ol_stateid * +find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *lst; + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_type != NFS4_LOCK_STID) + continue; + if (lst->st_stid.sc_file == fp) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, struct nfs4_file *fp, struct inode *inode, struct nfs4_ol_stateid *open_stp) { struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfs4_ol_stateid *retstp; - lockdep_assert_held(&clp->cl_lock); + mutex_init(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: + spin_lock(&clp->cl_lock); + spin_lock(&fp->fi_lock); + retstp = find_lock_stateid(lo, fp); + if (retstp) + goto out_unlock; - atomic_inc(&stp->st_stid.sc_count); + refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); @@ -5586,29 +5765,22 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - mutex_init(&stp->st_mutex); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); - spin_lock(&fp->fi_lock); list_add(&stp->st_perfile, &fp->fi_stateids); +out_unlock: spin_unlock(&fp->fi_lock); -} - -static struct nfs4_ol_stateid * -find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) -{ - struct nfs4_ol_stateid *lst; - struct nfs4_client *clp = lo->lo_owner.so_client; - - lockdep_assert_held(&clp->cl_lock); - - list_for_each_entry(lst, 
&lo->lo_owner.so_stateids, st_perstateowner) { - if (lst->st_stid.sc_file == fp) { - atomic_inc(&lst->st_stid.sc_count); - return lst; + spin_unlock(&clp->cl_lock); + if (retstp) { + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + stp = retstp; } - return NULL; + return stp; } static struct nfs4_ol_stateid * @@ -5621,26 +5793,25 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *clp = oo->oo_owner.so_client; + *new = false; spin_lock(&clp->cl_lock); lst = find_lock_stateid(lo, fi); - if (lst == NULL) { - spin_unlock(&clp->cl_lock); - ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); - if (ns == NULL) - return NULL; - - spin_lock(&clp->cl_lock); - lst = find_lock_stateid(lo, fi); - if (likely(!lst)) { - lst = openlockstateid(ns); - init_lock_stateid(lst, lo, fi, inode, ost); - ns = NULL; - *new = true; - } - } spin_unlock(&clp->cl_lock); - if (ns) + if (lst != NULL) { + if (nfsd4_lock_ol_stateid(lst) == nfs_ok) + goto out; + nfs4_put_stid(&lst->st_stid); + } + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); + if (ns == NULL) + return NULL; + + lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost); + if (lst == openlockstateid(ns)) + *new = true; + else nfs4_put_stid(ns); +out: return lst; } @@ -5677,7 +5848,6 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval; - bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5693,25 +5863,12 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } -retry: lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (lst == NULL) { status = nfserr_jukebox; goto out; } - mutex_lock(&lst->st_mutex); - - /* See if it's still hashed to avoid race with FREE_STATEID */ - spin_lock(&cl->cl_lock); - hashed = !list_empty(&lst->st_perfile); - spin_unlock(&cl->cl_lock); - - if (!hashed) { - mutex_unlock(&lst->st_mutex); - nfs4_put_stid(&lst->st_stid); - goto retry; - } status = nfs_ok; *plst = lst; out: @@ -5917,14 +6074,16 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; - mutex_unlock(&lock_stp->st_mutex); - /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. 
*/ - if (status && new) + if (status && new) { + lock_stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(lock_stp); + } + + mutex_unlock(&lock_stp->st_mutex); nfs4_put_stid(&lock_stp->st_stid); } @@ -6944,6 +7103,10 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; + nn->boot_time = get_seconds(); + nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; + INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); @@ -7001,13 +7164,10 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nn->boot_time = get_seconds(); - nn->grace_ended = false; - nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); - printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", - nn->nfsd4_grace, net); + printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n", + nn->nfsd4_grace, net->ns.inum); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); return 0; } @@ -7080,7 +7240,7 @@ nfs4_state_shutdown_net(struct net *net) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6493df6b1bd5..d107b4426f7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1241,6 +1241,9 @@ static __net_init int nfsd_init_net(struct net *net) nn->nfsd4_grace = 90; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); + + atomic_set(&nn->ntf_refcnt, 0); + init_waitqueue_head(&nn->ntf_wq); return 0; out_idmap_error: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e02bd2783124..89cb484f1cfb 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -335,7 +335,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -344,6 +345,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; @@ -363,7 +366,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -374,7 +378,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; } @@ -391,6 +396,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { 
unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -398,6 +404,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } + wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting @@ -447,7 +454,7 @@ void nfsd_reset_versions(void) */ static void set_max_drc(void) { - #define NFSD_DRC_SIZE_SHIFT 10 + #define NFSD_DRC_SIZE_SHIFT 7 nfsd_drc_max_mem = (nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; @@ -517,7 +524,8 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ + atomic_inc(&nn->ntf_refcnt); + ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 005c911b34ac..f3772ea8ba0d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -36,6 +36,7 @@ #define _NFSD4_STATE_H #include <linux/idr.h> +#include <linux/refcount.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsfh.h" @@ -83,7 +84,7 @@ struct nfsd4_callback_ops { * fields that are of general use to any stateid. */ struct nfs4_stid { - atomic_t sc_count; + refcount_t sc_count; #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 @@ -169,11 +170,13 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) struct nfsd4_slot { u32 sl_seqid; __be32 sl_status; + struct svc_cred sl_cred; u32 sl_datalen; u16 sl_opcnt; #define NFSD4_SLOT_INUSE (1 << 0) #define NFSD4_SLOT_CACHETHIS (1 << 1) #define NFSD4_SLOT_INITIALIZED (1 << 2) +#define NFSD4_SLOT_CACHED (1 << 3) u8 sl_flags; char sl_data[]; }; @@ -465,7 +468,7 @@ struct nfs4_clnt_odstate { struct nfs4_client *co_client; struct nfs4_file *co_file; struct list_head co_perfile; - atomic_t co_odcount; + refcount_t co_odcount; }; /* @@ -481,7 +484,7 @@ struct nfs4_clnt_odstate { * the global state_lock spinlock. */ struct nfs4_file { - atomic_t fi_ref; + refcount_t fi_ref; spinlock_t fi_lock; struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; @@ -634,7 +637,7 @@ struct nfs4_file *find_file(struct knfsd_fh *fh); void put_nfs4_file(struct nfs4_file *fi); static inline void get_nfs4_file(struct nfs4_file *fi) { - atomic_inc(&fi->fi_ref); + refcount_inc(&fi->fi_ref); } struct file *find_any_file(struct nfs4_file *f); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 1e4edbf70052..bc29511b6405 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -649,9 +649,18 @@ static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; } -static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +/* + * The session reply cache only needs to cache replies that the client + * actually asked us to. But it's almost free for us to cache compounds + * consisting of only a SEQUENCE op, so we may as well cache those too. + * Also, the protocol doesn't give us a convenient response in the case + * of a replay of a solo SEQUENCE op that wasn't cached + * (RETRY_UNCACHED_REP can only be returned in the second op of a + * compound). 
+ */ +static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp) { - return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) + return (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) || nfsd4_is_solo_sequence(resp); } diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 06ffa135dfa6..16a7a67a11c9 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2156,10 +2156,10 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, btcache, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 515d13c196da..1a2894aa0194 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -150,7 +150,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = nilfs_new_inode(dir, S_IFLNK | 0777); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 8616c46d33da..68241512d7c1 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -255,10 +255,9 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pgoff_t index = 0; int err = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -310,7 +309,7 @@ void nilfs_copy_back_pages(struct address_space *dmap, pgoff_t index = 0; int err; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: n = pagevec_lookup(&pvec, smap, &index); if (!n) @@ -374,10 +373,10 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -519,7 +518,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 70ded52dc1dd..9f3ffba41533 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -708,21 +708,17 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: if (unlikely(index > last) || - !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - min_t(pgoff_t, last - index, - PAGEVEC_SIZE - 1) + 1)) + !pagevec_lookup_range_tag(&pvec, mapping, &index, last, + PAGECACHE_TAG_DIRTY)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { struct buffer_head *bh, *head; struct page *page = pvec.pages[i]; - if (unlikely(page->index > last)) - break; - lock_page(page); if (!page_has_buffers(page)) create_empty_buffers(page, i_blocksize(inode), 0); @@ 
-757,10 +753,10 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { @@ -1958,8 +1954,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err, ii->vfs_inode.i_ino); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1968,6 +1962,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); @@ -1981,7 +1979,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; - int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); + int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); @@ -2404,11 +2402,11 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) return err; } -static void nilfs_construction_timeout(unsigned long data) +static void nilfs_construction_timeout(struct timer_list *t) { - struct task_struct *p = (struct task_struct *)data; + struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer); - wake_up_process(p); + wake_up_process(sci->sc_timer_task); } static void @@ -2546,8 +2544,7 @@ static int nilfs_segctor_thread(void *arg) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int timeout = 0; - sci->sc_timer.data = (unsigned long)current; - sci->sc_timer.function = nilfs_construction_timeout; + sci->sc_timer_task = current; /* start sync. */ sci->sc_task = current; @@ -2678,7 +2675,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); - init_timer(&sci->sc_timer); + timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 1060949d7dd2..84084a4d9b3e 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -180,6 +180,7 @@ struct nilfs_sc_info { unsigned long sc_watermark; struct timer_list sc_timer; + struct task_struct *sc_timer_task; struct task_struct *sc_task; }; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1541a1e9221a..1341a41e7b43 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -630,22 +630,22 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, } /** - * nilfs_sufile_truncate_range - truncate range of segment array - * @sufile: inode of segment usage file - * @start: start segment number (inclusive) - * @end: end segment number (inclusive) - * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range - */ + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4fc018dfcfae..3073b646e1ba 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -141,7 +141,7 @@ void __nilfs_error(struct super_block *sb, const char *function, if (nilfs_test_opt(nilfs, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -160,7 +160,6 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - ii->vfs_inode.i_version = 1; nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } @@ -870,7 +869,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, /* FS independent flags */ #ifdef NILFS_ATIME_DISABLE - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #endif nilfs_set_default_options(sb, sbp); @@ -1134,7 +1133,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL); err = -EINVAL; @@ -1144,12 +1143,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* Shutting down log writer */ nilfs_detach_log_writer(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Remounting a valid RW partition RDONLY, so set @@ -1179,7 +1178,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); @@ -1213,7 +1212,7 @@ static int nilfs_parse_snapshot_option(const char *option, const char *msg = NULL; int err; - if (!(sd->flags & MS_RDONLY)) { + if (!(sd->flags & SB_RDONLY)) { msg = "read-only option is not specified"; goto parse_error; } @@ -1287,7 +1286,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, struct dentry *root_dentry; int err, s_new = false; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1328,14 +1327,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); - err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + err = nilfs_fill_super(s, data, flags & SB_SILENT ? 
1 : 0); if (err) goto failed_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } else if (!sd.cno) { if (nilfs_tree_is_busy(s->s_root)) { - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { nilfs_msg(s, KERN_ERR, "the device already has a %s mount.", sb_rdonly(s) ? "read-only" : "read/write"); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2dd75bf619ad..1a85317e83f0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -220,7 +220,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (!valid_fs) { nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs"); - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { nilfs_msg(sb, KERN_INFO, "recovery required for readonly filesystem"); nilfs_msg(sb, KERN_INFO, @@ -286,7 +286,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (valid_fs) goto skip_recovery; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { @@ -309,7 +309,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) err = -EROFS; goto failed_unload; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_msg(sb, KERN_ERR, "recovery cancelled because norecovery option was specified for a read/write mount"); @@ -737,7 +737,7 @@ struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { n = n->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } @@ -776,7 +776,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { p = &(*p)->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; @@ -786,7 +786,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); @@ -806,7 +806,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (atomic_dec_and_test(&root->count)) { + if (refcount_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; nilfs_sysfs_delete_snapshot_group(root); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b305c6f033e7..883d732b0259 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -27,6 +27,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> +#include <linux/refcount.h> struct nilfs_sc_info; struct nilfs_sysfs_dev_subgroups; @@ -246,7 +247,7 @@ struct nilfs_root { __u64 cno; struct rb_node rb_node; - atomic_t count; + refcount_t count; struct the_nilfs *nilfs; struct inode *ifile; @@ -299,7 +300,7 @@ void nilfs_swap_super_block(struct the_nilfs *); static inline void nilfs_get_root(struct nilfs_root *root) { - atomic_inc(&root->count); + refcount_inc(&root->count); } static inline int nilfs_valid_fs(struct the_nilfs *nilfs) diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index e5f911bd80d2..41355ce74ac0 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -21,6 +21,6 @@ config FANOTIFY_ACCESS_PERMISSIONS decisions concerning filesystem events. 
This is used by some fanotify listeners which need to scan files before allowing the system access to use those files. This is used by some anti-malware vendors and by some - hierarchical storage managent systems. + hierarchical storage management systems. If unsure, say N. diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 81d8959b6aef..219b269c737e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -67,7 +67,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) /* * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually + * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ diff --git a/fs/nsfs.c b/fs/nsfs.c index ef243e14b6eb..7c6f76d29f56 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -255,5 +255,5 @@ void __init nsfs_init(void) nsfs_mnt = kern_mount(&nsfs); if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); - nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; + nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3f70f041dbe9..bb7159f697f2 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -473,7 +473,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; #else /* NTFS_RW */ /* * For the read-write compiled driver, if we are remounting read-write, @@ -487,7 +487,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) * When remounting read-only, mark the volume clean if no volume errors * have occurred. */ - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { static const char *es = ". Cannot remount read-write."; /* Remounting read-write. */ @@ -548,7 +548,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) NVolSetErrors(vol); return -EROFS; } - } else if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { /* Remounting read-only. */ if (!NVolErrors(vol)) { if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) @@ -1799,7 +1799,7 @@ static bool load_system_files(ntfs_volume *vol) es3); goto iput_mirr_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", !vol->mftmirr_ino ? es1 : es2, es3); } else @@ -1937,7 +1937,7 @@ get_ctx_vol_failed: es1, es2); goto iput_vol_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1974,7 +1974,7 @@ get_ctx_vol_failed: } goto iput_logfile_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2019,7 +2019,7 @@ get_ctx_vol_failed: es1, es2); goto iput_root_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2042,7 +2042,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Do not set NVolErrors() because ntfs_remount() might manage * to set the dirty flag in which case all would be well. 
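[Editor's aside: a recurring pattern in the ntfs and nilfs2 hunks above is demoting a mount to read-only when on-disk errors are found, now spelled with the kernel-internal SB_RDONLY flag rather than the mount(2) UAPI name MS_RDONLY. A minimal sketch of that error path, with the myfs name and message text invented for illustration:]

#include <linux/fs.h>
#include <linux/printk.h>

/* Hypothetical helper: report corruption once and degrade to read-only. */
static void myfs_handle_error(struct super_block *sb, const char *why)
{
	if (sb_rdonly(sb))
		return;		/* already read-only, nothing to demote */

	pr_crit("myfs (%s): %s; remounting read-only\n", sb->s_id, why);
	sb->s_flags |= SB_RDONLY;
}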
@@ -2055,7 +2055,7 @@ get_ctx_vol_failed: * If (still) a read-write mount, set the NT4 compatibility flag on * newer NTFS version volumes. */ - if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { static const char *es1 = "Failed to set NT4 compatibility flag"; static const char *es2 = ". Run chkdsk."; @@ -2069,7 +2069,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif @@ -2087,7 +2087,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2128,7 +2128,7 @@ get_ctx_vol_failed: es1, es2); goto iput_quota_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2150,7 +2150,7 @@ get_ctx_vol_failed: goto iput_quota_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } /* @@ -2171,7 +2171,7 @@ get_ctx_vol_failed: es1, es2); goto iput_usnjrnl_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2194,7 +2194,7 @@ get_ctx_vol_failed: goto iput_usnjrnl_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2728,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) lockdep_off(); ntfs_debug("Entering."); #ifndef NTFS_RW - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; #endif /* ! NTFS_RW */ /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index addd7c5f2d3e..ab5105f9767e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3585,8 +3585,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, * The easy case - we can just plop the record right in. 
*/ *left_rec = *split_rec; - - has_empty_extent = 0; } else le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 88a31e9340a0..d1516327b787 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -134,6 +134,19 @@ bail: return err; } +static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + down_read(&oi->ip_alloc_sem); + ret = ocfs2_get_block(inode, iblock, bh_result, create); + up_read(&oi->ip_alloc_sem); + + return ret; +} + int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -2128,7 +2141,7 @@ static void ocfs2_dio_free_write_ctx(struct inode *inode, * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); */ -static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, +static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2154,12 +2167,9 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, * while file size will be changed. */ if (pos + total_len <= i_size_read(inode)) { - down_read(&oi->ip_alloc_sem); - /* This is the fast path for re-write. */ - ret = ocfs2_get_block(inode, iblock, bh_result, create); - - up_read(&oi->ip_alloc_sem); + /* This is the fast path for re-write. */ + ret = ocfs2_lock_get_block(inode, iblock, bh_result, create); if (buffer_mapped(bh_result) && !buffer_new(bh_result) && ret == 0) @@ -2424,9 +2434,9 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; if (iov_iter_rw(iter) == READ) - get_block = ocfs2_get_block; + get_block = ocfs2_lock_get_block; else - get_block = ocfs2_dio_get_block; + get_block = ocfs2_dio_wr_get_block; return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index b97bcc6dde7c..b1bb70c8ca4d 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -28,9 +28,6 @@ #include <linux/buffer_head.h> -void ocfs2_end_buffer_io_sync(struct buffer_head *bh, - int uptodate); - int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct ocfs2_caching_info *ci); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 3ef5137dc362..a9e67efc0004 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -79,10 +79,8 @@ void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_exit(void); int o2hb_init(void); -int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); -int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); int o2hb_get_all_regions(char *region_uuids, u8 numregions); int o2hb_global_heartbeat_active(void); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index a51200ece93d..da64c3a20eeb 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -40,6 +40,9 @@ char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { "panic", /* O2NM_FENCE_PANIC */ }; +static inline void o2nm_lock_subsystem(void); +static inline void o2nm_unlock_subsystem(void); + struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ 
-181,7 +184,10 @@ static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) { /* through the first node_set .parent * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ - return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + if (node->nd_item.ci_parent) + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + else + return NULL; } enum { @@ -194,7 +200,7 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; int ret = 0; @@ -214,6 +220,13 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) ret = -EEXIST; @@ -226,6 +239,8 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -269,7 +284,7 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; int ret, i; struct rb_node **p, *parent; unsigned int octets[4]; @@ -286,6 +301,13 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); } + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + ret = 0; write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) @@ -298,6 +320,8 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -315,7 +339,7 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; ssize_t ret; @@ -333,17 +357,26 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + ret = -EINVAL; + goto out; + } + /* the only failure case is trying to set a new local node * when a different one is already set */ if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; + cluster->cl_local_node != node->nd_num) { + ret = -EBUSY; + goto out; + } /* bring up the rx thread if we're setting the new local node. 
*/ if (tmp && !cluster->cl_has_local) { ret = o2net_start_listening(node); if (ret) - return ret; + goto out; } if (!tmp && cluster->cl_has_local && @@ -358,7 +391,11 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, cluster->cl_local_node = node->nd_num; } - return count; + ret = count; + +out: + o2nm_unlock_subsystem(); + return ret; } CONFIGFS_ATTR(o2nm_node_, num); @@ -738,6 +775,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +static inline void o2nm_lock_subsystem(void) +{ + mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + +static inline void o2nm_unlock_subsystem(void) +{ + mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + int o2nm_depend_item(struct config_item *item) { return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 8d779227370a..bebe59feca58 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -140,7 +140,7 @@ static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); -static void o2net_idle_timer(unsigned long data); +static void o2net_idle_timer(struct timer_list *t); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); @@ -450,8 +450,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, - (unsigned long)sc); + timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0); sclog(sc, "alloced\n"); @@ -1517,9 +1516,9 @@ static void o2net_sc_send_keep_req(struct work_struct *work) /* socket shutdown does a del_timer_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ -static void o2net_idle_timer(unsigned long data) +static void o2net_idle_timer(struct timer_list *t) { - struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout); struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); #ifdef CONFIG_DEBUG_FS unsigned long msecs = ktime_to_ms(ktime_get()) - diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index a2b19fbdcf46..e1fea149f50b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -394,7 +394,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) { if (dlm->dlm_worker) { - flush_workqueue(dlm->dlm_worker); destroy_workqueue(dlm->dlm_worker); dlm->dlm_worker = NULL; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..9c3e0f13ca87 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2616,7 +2616,9 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * otherwise the assert_master from the new * master will destroy this. 
*/ - dlm_get_mle_inuse(mle); + if (ret != -EEXIST) + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..ec8f75813beb 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_lockres_put(res); continue; } + dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 9ab9e1892b5f..9c7c18c0e129 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -88,13 +88,13 @@ struct workqueue_struct *user_dlm_worker; */ #define DLMFS_CAPABILITIES "bast stackglue" static int param_set_dlmfs_capabilities(const char *val, - struct kernel_param *kp) + const struct kernel_param *kp) { printk(KERN_ERR "%s: readonly parameter\n", kp->name); return -EINVAL; } static int param_get_dlmfs_capabilities(char *buffer, - struct kernel_param *kp) + const struct kernel_param *kp) { return strlcpy(buffer, DLMFS_CAPABILITIES, strlen(DLMFS_CAPABILITIES) + 1); @@ -670,7 +670,6 @@ static void __exit exit_dlmfs_fs(void) { unregister_filesystem(&dlmfs_fs_type); - flush_workqueue(user_dlm_worker); destroy_workqueue(user_dlm_worker); /* diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6e41fc8fabbe..a1d051055472 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -227,7 +227,7 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if ((inode->i_flags & S_NOATIME) || - ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))) return 0; /* @@ -1161,6 +1161,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + /* + * Here we should wait dio to finish before inode lock + * to avoid a deadlock between ocfs2_setattr() and + * ocfs2_dio_end_io_write() + */ + inode_dio_wait(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1200,8 +1207,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) goto bail_unlock; - inode_dio_wait(inode); - if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 2cabbcf2f28e..e87279e49ba3 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -129,19 +129,13 @@ static struct kobj_attribute ocfs2_attr_filecheck_set = ocfs2_filecheck_show, ocfs2_filecheck_store); -static int ocfs2_filecheck_sysfs_wait(atomic_t *p) -{ - schedule(); - return 0; -} - static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, + wait_on_atomic_t(&entry->fs_count, atomic_t_wait, TASK_UNINTERRUPTIBLE); spin_lock(&entry->fs_fcheck->fc_lock); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 71f22c8fbffd..9f0b95abc09f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1147,12 +1147,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); - if (status < 0 && 
status != -ENOSPC) { + if (status < 0 && status != -ENOSPC) mlog_errno(status); - goto bail; - } -bail: return status; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80733496b22a..80efa5699fb0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -675,9 +675,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } /* We're going to/from readonly mode. */ - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -691,8 +691,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & MS_RDONLY) { - sb->s_flags |= MS_RDONLY; + if (*flags & SB_RDONLY) { + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { @@ -709,14 +709,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto unlock_osb; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & MS_RDONLY)) { + if (!ret && !(*flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -724,7 +724,7 @@ unlock_osb: if (ret < 0) { /* Return back changes... */ spin_lock(&osb->osb_lock); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); goto out; @@ -744,9 +744,9 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); } out: return ret; @@ -1057,10 +1057,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | - ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? 
SB_POSIXACL : 0); - /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + /* Hard readonly mode only if: bdev_read_only, SB_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { if (!sb_rdonly(sb)) { @@ -2057,7 +2057,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; /* this is needed to support O_LARGEFILE */ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); @@ -2521,10 +2521,8 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) /* This function assumes that the caller has the main osb resource */ /* ocfs2_initializer_super have already created this workqueue */ - if (osb->ocfs2_wq) { - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) destroy_workqueue(osb->ocfs2_wq); - } ocfs2_free_slot_info(osb); @@ -2570,7 +2568,7 @@ static int ocfs2_handle_error(struct super_block *sb) return rv; pr_crit("OCFS2: File system is now read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ocfs2_set_ro_flag(osb, 0); } diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b023e4f3d740..d4550c8bbc41 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -26,9 +26,6 @@ #ifndef OCFS2_SUPER_H #define OCFS2_SUPER_H -int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, - int node_num); - __printf(3, 4) int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5fdf269ba82e..c5898c59d411 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -901,7 +901,7 @@ static int ocfs2_xattr_list_entry(struct super_block *sb, case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: - if (!(sb->s_flags & MS_POSIXACL)) + if (!(sb->s_flags & SB_POSIXACL)) return 0; break; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 13215f26e321..2200662a9bf1 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -369,7 +369,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -386,7 +386,7 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) struct op_inode_info *oi; int ret; - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = OPENPROM_SUPER_MAGIC; diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index c2d8233b1e82..480ea059a680 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -155,13 +155,11 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) int orangefs_init_acl(struct inode *inode, struct inode *dir) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct posix_acl *default_acl, *acl; umode_t mode = inode->i_mode; + struct iattr iattr; int error = 0; - ClearModeFlag(orangefs_inode); - error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) return error; @@ -180,9 +178,11 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) /* If mode of the inode was changed, then do a forcible ->setattr */ if (mode != inode->i_mode) { - SetModeFlag(orangefs_inode); + memset(&iattr, 0, sizeof iattr); inode->i_mode = mode; - orangefs_flush_inode(inode); + iattr.ia_mode = mode; + 
iattr.ia_valid |= ATTR_MODE; + orangefs_inode_setattr(inode, &iattr); } return error; diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index a8cc588d6224..e2c2699d8016 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -386,7 +386,6 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) { struct orangefs_dir *od = file->private_data; struct orangefs_dir_part *part = od->part; - orangefs_flush_inode(inode); while (part) { struct orangefs_dir_part *next = part->next; vfree(part); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index e4a8e6a7eb17..1668fd645c45 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -383,9 +383,15 @@ out: if (type == ORANGEFS_IO_READ) { file_accessed(file); } else { - SetMtimeFlag(orangefs_inode); - inode->i_mtime = current_time(inode); - mark_inode_dirty_sync(inode); + file_update_time(file); + /* + * Must invalidate to ensure write loop doesn't + * prevent kernel from reading updated + * attribute. Size probably changed because of + * the write, and other clients could update + * any other attribute. + */ + orangefs_inode->getattr_time = jiffies - 1; } } @@ -615,8 +621,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file) "orangefs_file_release: called on %pD\n", file); - orangefs_flush_inode(inode); - /* * remove all associated inode pages from the page cache and * readahead cache (if any); this forces an expensive refresh of @@ -666,8 +670,6 @@ static int orangefs_fsync(struct file *file, ret); op_release(new_op); - - orangefs_flush_inode(file_inode(file)); return ret; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 28825a5b6d09..fe1d705ad91f 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -290,6 +290,22 @@ int orangefs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +int orangefs_update_time(struct inode *inode, struct timespec *time, int flags) +{ + struct iattr iattr; + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", + get_khandle_from_ino(inode)); + generic_update_time(inode, time, flags); + memset(&iattr, 0, sizeof iattr); + if (flags & S_ATIME) + iattr.ia_valid |= ATTR_ATIME; + if (flags & S_CTIME) + iattr.ia_valid |= ATTR_CTIME; + if (flags & S_MTIME) + iattr.ia_valid |= ATTR_MTIME; + return orangefs_inode_setattr(inode, &iattr); +} + /* ORANGEDS2 implementation of VFS inode operations for files */ const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, @@ -298,6 +314,7 @@ const struct inode_operations orangefs_file_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; static int orangefs_init_iops(struct inode *inode) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 7e9e5d0ea3bc..c98bba2dbc94 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -22,7 +22,9 @@ static int orangefs_create(struct inode *dir, { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int ret; gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n", @@ -55,8 +57,10 @@ static int orangefs_create(struct inode *dir, if (ret < 0) goto out; - inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, - &new_op->downcall.resp.create.refn); + ref = new_op->downcall.resp.create.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref); 
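[Editor's aside on the orangefs changes in this area: instead of setting per-inode dirty flags (SetMtimeFlag() and friends) and flushing them later from orangefs_flush_inode(), the patch pushes each attribute change to the server immediately through orangefs_inode_setattr(). A minimal sketch of the repeated idiom, wrapped in a hypothetical helper for clarity — the patch open-codes this after each directory update rather than adding such a helper:]

#include <linux/fs.h>
#include <linux/string.h>

/* Illustrative only; orangefs_inode_setattr() is the orangefs-internal
 * helper declared in orangefs-kernel.h. */
static int orangefs_push_mtime(struct inode *dir)
{
	struct iattr iattr;

	memset(&iattr, 0, sizeof(iattr));
	iattr.ia_valid = ATTR_MTIME;	/* the server dictates the actual time */

	return orangefs_inode_setattr(dir, &iattr);
}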
if (IS_ERR(inode)) { gossip_err("%s: Failed to allocate inode for file :%pd:\n", __func__, @@ -82,12 +86,13 @@ static int orangefs_create(struct inode *dir, __func__, dentry); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); ret = 0; out: - op_release(new_op); gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd: returning %d\n", __func__, @@ -221,6 +226,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct iattr iattr; int ret; gossip_debug(GOSSIP_NAME_DEBUG, @@ -253,8 +259,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) if (!ret) { drop_nlink(inode); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); } return ret; @@ -266,7 +274,9 @@ static int orangefs_symlink(struct inode *dir, { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int mode = 755; int ret; @@ -307,8 +317,10 @@ static int orangefs_symlink(struct inode *dir, goto out; } - inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, - &new_op->downcall.resp.sym.refn); + ref = new_op->downcall.resp.sym.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err ("*** Failed to allocate orangefs symlink inode\n"); @@ -331,12 +343,13 @@ static int orangefs_symlink(struct inode *dir, get_khandle_from_ino(inode), dentry); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); ret = 0; out: - op_release(new_op); return ret; } @@ -344,7 +357,9 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int ret; new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR); @@ -373,8 +388,10 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out; } - inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, - &new_op->downcall.resp.mkdir.refn); + ref = new_op->downcall.resp.mkdir.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err("*** Failed to allocate orangefs dir inode\n"); ret = PTR_ERR(inode); @@ -400,11 +417,12 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * NOTE: we have no good way to keep nlink consistent for directories * across clients; keep constant at 1. 
*/ - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); out: - op_release(new_op); return ret; } @@ -470,4 +488,5 @@ const struct inode_operations orangefs_dir_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index b6001bb28f5a..c7db56a31b92 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ -15,8 +15,10 @@ #ifdef __KERNEL__ #include <linux/types.h> +#include <linux/kernel.h> #else #include <stdint.h> +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif #define GOSSIP_NO_DEBUG (__u64)0 @@ -88,6 +90,6 @@ static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { }; static const int num_kmod_keyword_mask_map = (int) - (sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s)); + (ARRAY_SIZE(s_kmod_keyword_mask_map)); #endif /* __ORANGEFS_DEBUG_H */ diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 004af348fb80..97adf7d100b5 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -209,37 +209,10 @@ struct orangefs_inode_s { struct inode vfs_inode; sector_t last_failed_block_index_read; - /* - * State of in-memory attributes not yet flushed to disk associated - * with this object - */ - unsigned long pinode_flags; - unsigned long getattr_time; u32 getattr_mask; }; -#define P_ATIME_FLAG 0 -#define P_MTIME_FLAG 1 -#define P_CTIME_FLAG 2 -#define P_MODE_FLAG 3 - -#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) -#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) -#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) - -#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) -#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) -#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) - -#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) -#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) -#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) - -#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags) -#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags) -#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags) - /* per superblock private orangefs info */ struct orangefs_sb_info_s { struct orangefs_khandle root_khandle; @@ -275,12 +248,6 @@ struct orangefs_kiocb_s { /* orangefs kernel operation type */ struct orangefs_kernel_op_s *op; - /* The user space buffers from/to which I/O is being staged */ - struct iovec *iov; - - /* number of elements in the iovector */ - unsigned long nr_segs; - /* set to indicate the type of the operation */ int rw; @@ -442,6 +409,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, int orangefs_permission(struct inode *inode, int mask); +int orangefs_update_time(struct inode *, struct timespec *, int); + /* * defined in xattr.c */ @@ -484,8 +453,6 @@ bool __is_daemon_in_service(void); */ __s32 fsid_of_op(struct orangefs_kernel_op_s *op); -int orangefs_flush_inode(struct inode *inode); - ssize_t orangefs_inode_getxattr(struct inode *inode, const 
char *name, void *buffer, diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index f82336496311..97fe93129f38 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -4,6 +4,7 @@ * * See COPYING in top-level directory. */ +#include <linux/kernel.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" @@ -437,89 +438,8 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr) op_release(new_op); - /* - * successful setattr should clear the atime, mtime and - * ctime flags. - */ - if (ret == 0) { - ClearAtimeFlag(orangefs_inode); - ClearMtimeFlag(orangefs_inode); - ClearCtimeFlag(orangefs_inode); - ClearModeFlag(orangefs_inode); + if (ret == 0) orangefs_inode->getattr_time = jiffies - 1; - } - - return ret; -} - -int orangefs_flush_inode(struct inode *inode) -{ - /* - * If it is a dirty inode, this function gets called. - * Gather all the information that needs to be setattr'ed - * Right now, this will only be used for mode, atime, mtime - * and/or ctime. - */ - struct iattr wbattr; - int ret; - int mtime_flag; - int ctime_flag; - int atime_flag; - int mode_flag; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - - memset(&wbattr, 0, sizeof(wbattr)); - - /* - * check inode flags up front, and clear them if they are set. This - * will prevent multiple processes from all trying to flush the same - * inode if they call close() simultaneously - */ - mtime_flag = MtimeFlag(orangefs_inode); - ClearMtimeFlag(orangefs_inode); - ctime_flag = CtimeFlag(orangefs_inode); - ClearCtimeFlag(orangefs_inode); - atime_flag = AtimeFlag(orangefs_inode); - ClearAtimeFlag(orangefs_inode); - mode_flag = ModeFlag(orangefs_inode); - ClearModeFlag(orangefs_inode); - - /* -- Lazy atime,mtime and ctime update -- - * Note: all times are dictated by server in the new scheme - * and not by the clients - * - * Also mode updates are being handled now.. - */ - - if (mtime_flag) - wbattr.ia_valid |= ATTR_MTIME; - if (ctime_flag) - wbattr.ia_valid |= ATTR_CTIME; - if (atime_flag) - wbattr.ia_valid |= ATTR_ATIME; - - if (mode_flag) { - wbattr.ia_mode = inode->i_mode; - wbattr.ia_valid |= ATTR_MODE; - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "*********** orangefs_flush_inode: %pU " - "(ia_valid %d)\n", - get_khandle_from_ino(inode), - wbattr.ia_valid); - if (wbattr.ia_valid == 0) { - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_flush_inode skipping setattr()\n"); - return 0; - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_flush_inode (%pU) writing mode %o\n", - get_khandle_from_ino(inode), - inode->i_mode); - - ret = orangefs_inode_setattr(inode, &wbattr); return ret; } @@ -606,7 +526,7 @@ int orangefs_normalize_to_errno(__s32 error_code) /* Convert ORANGEFS encoded errno values into regular errno values. 
*/ } else if ((-error_code) & ORANGEFS_ERROR_BIT) { i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS); - if (i < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping)) + if (i < ARRAY_SIZE(PINT_errno_mapping)) error_code = -PINT_errno_mapping[i]; else error_code = -EINVAL; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 47ebd9bfd1a1..36f1390b5ed7 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -40,7 +40,7 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (orangefs_sb->flags & ORANGEFS_OPT_INTR) seq_puts(m, ",intr"); @@ -60,7 +60,7 @@ static int parse_mount_options(struct super_block *sb, char *options, * Force any potential flags that might be set from the mount * to zero, ie, initialize to unset. */ - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; @@ -73,7 +73,7 @@ static int parse_mount_options(struct super_block *sb, char *options, token = match_token(p, tokens, args); switch (token) { case Opt_acl: - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; break; case Opt_intr: orangefs_sb->flags |= ORANGEFS_OPT_INTR; @@ -99,8 +99,6 @@ static void orangefs_inode_cache_ctor(void *req) inode_init_once(&orangefs_inode->vfs_inode); init_rwsem(&orangefs_inode->xattr_sem); - - orangefs_inode->vfs_inode.i_version = 1; } static struct inode *orangefs_alloc_inode(struct super_block *sb) @@ -119,7 +117,6 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL; orangefs_inode->last_failed_block_index_read = 0; memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target)); - orangefs_inode->pinode_flags = 0; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_alloc_inode: allocated %p\n", @@ -299,21 +296,9 @@ void fsid_key_table_finalize(void) { } -/* Called whenever the VFS dirties the inode in response to atime updates */ -static void orangefs_dirty_inode(struct inode *inode, int flags) -{ - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - - gossip_debug(GOSSIP_SUPER_DEBUG, - "orangefs_dirty_inode: %pU\n", - get_khandle_from_ino(inode)); - SetAtimeFlag(orangefs_inode); -} - static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, .destroy_inode = orangefs_destroy_inode, - .dirty_inode = orangefs_dirty_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, .remount_fs = orangefs_remount_fs, @@ -522,7 +507,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, ret = orangefs_fill_sb(sb, &new_op->downcall.resp.fs_mount, data, - flags & MS_SILENT ? 1 : 0); + flags & SB_SILENT ? 
1 : 0); if (ret) { d = ERR_PTR(ret); diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index d856cdf91763..db107fe91ab3 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -15,4 +15,5 @@ const struct inode_operations orangefs_symlink_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index cbfc196e5dc5..5ac415466861 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,16 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. +config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW + bool "Overlayfs: follow redirects even if redirects are turned off" + default y + depends on OVERLAY_FS + help + Disable this to get a possibly more secure configuration, but that + might not be backward compatible with previous kernels. + + For more information, see Documentation/filesystems/overlayfs.txt + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index c441f9387a1b..eb3b8d39fb61 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -22,7 +22,6 @@ #include <linux/ratelimit.h> #include <linux/exportfs.h> #include "overlayfs.h" -#include "ovl_entry.h" #define OVL_COPY_UP_CHUNK_SIZE (1 << 20) @@ -486,6 +485,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) { struct inode *udir = c->destdir->d_inode; + struct inode *inode; struct dentry *newdentry = NULL; struct dentry *temp = NULL; int err; @@ -508,7 +508,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) if (err) goto out_cleanup; - ovl_inode_update(d_inode(c->dentry), newdentry); + inode = d_inode(c->dentry); + ovl_inode_update(inode, newdentry); + if (S_ISDIR(inode->i_mode)) + ovl_set_flag(OVL_WHITEOUTS, inode); + out: dput(temp); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index cc961a3bd3bd..f9788bc116a8 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -181,6 +181,11 @@ static bool ovl_type_origin(struct dentry *dentry) return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); } +static bool ovl_may_have_whiteouts(struct dentry *dentry) +{ + return ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct cattr *attr, struct dentry *hardlink) { @@ -300,7 +305,6 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) { int err; struct dentry *ret = NULL; - enum ovl_path_type type = ovl_path_type(dentry); LIST_HEAD(list); err = ovl_check_empty_dir(dentry, &list); @@ -313,13 +317,13 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) * When removing an empty opaque directory, then it makes no sense to * replace it with an exact replica of itself. * - * If no upperdentry then skip clearing whiteouts. + * If upperdentry has whiteouts, clear them. * * Can race with copy-up, since we don't hold the upperdir mutex. * Doesn't matter, since copy-up can't create a non-empty directory * from an empty one. 
*/ - if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type)) + if (!list_empty(&list)) ret = ovl_clear_empty(dentry, &list); out_free: @@ -698,8 +702,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - /* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */ - if (is_dir && ovl_dentry_get_redirect(dentry)) { + /* Redirect/origin dir can be !ovl_lower_positive && not clean */ + if (is_dir && (ovl_dentry_get_redirect(dentry) || + ovl_may_have_whiteouts(dentry))) { opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) @@ -882,7 +887,8 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + err); /* Fall back to userspace copy-up */ err = -EXDEV; } @@ -946,7 +952,8 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, old_cred = ovl_override_creds(old->d_sb); - if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) { + if (overwrite && new_is_dir && (ovl_type_merge_or_lower(new) || + ovl_may_have_whiteouts(new))) { opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { @@ -1069,9 +1076,10 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, drop_nlink(d_inode(new)); } - ovl_dentry_version_inc(old->d_parent, - !overwrite && ovl_type_origin(new)); - ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old)); + ovl_dentry_version_inc(old->d_parent, ovl_type_origin(old) || + (!overwrite && ovl_type_origin(new))); + ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old) || + (d_inode(new) && ovl_type_origin(new))); out_dput: dput(newdentry); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 321511ed8c42..00b6b294272a 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -15,6 +15,14 @@ #include <linux/ratelimit.h> #include "overlayfs.h" + +static dev_t ovl_get_pseudo_dev(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerstack[0].layer->pseudo_dev; +} + int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -66,6 +74,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, struct path realpath; const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); + bool samefs = ovl_same_sb(dentry->d_sb); int err; type = ovl_path_real(dentry, &realpath); @@ -75,16 +84,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat, goto out; /* - * When all layers are on the same fs, all real inode number are - * unique, so we use the overlay st_dev, which is friendly to du -x. - * - * We also use st_ino of the copy up origin, if we know it. - * This guaranties constant st_dev/st_ino across copy up. + * For non-dir or same fs, we use st_ino of the copy up origin, if we + * know it. This guaranties constant st_dev/st_ino across copy up. * * If filesystem supports NFS export ops, this also guaranties * persistent st_ino across mount cycle. */ - if (ovl_same_sb(dentry->d_sb)) { + if (!is_dir || samefs) { if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | (!is_dir ? 
STATX_NLINK : 0); @@ -95,7 +101,6 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (err) goto out; - WARN_ON_ONCE(stat->dev != lowerstat.dev); /* * Lower hardlinks may be broken on copy up to different * upper files, so we cannot use the lower origin st_ino @@ -107,17 +112,36 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (is_dir || lowerstat.nlink == 1 || ovl_test_flag(OVL_INDEX, d_inode(dentry))) stat->ino = lowerstat.ino; + + if (samefs) + WARN_ON_ONCE(stat->dev != lowerstat.dev); + else + stat->dev = ovl_get_pseudo_dev(dentry); } - stat->dev = dentry->d_sb->s_dev; - } else if (is_dir) { + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + } else if (!OVL_TYPE_UPPER(type)) { + /* + * For non-samefs setup, to make sure that st_dev/st_ino + * pair is unique across the system, we use a unique + * anonymous st_dev for lower layer inode. + */ + stat->dev = ovl_get_pseudo_dev(dentry); + } + } else { /* - * If not all layers are on the same fs the pair {real st_ino; - * overlay st_dev} is not unique, so use the non persistent - * overlay st_ino. - * * Always use the overlay st_dev for directories, so 'find * -xdev' will scan the entire overlay mount and won't cross the * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; @@ -409,6 +433,7 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #ifdef CONFIG_LOCKDEP static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth - 1; @@ -419,6 +444,8 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); else lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); + + lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); #endif } @@ -657,6 +684,16 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); + /* Check for non-merge dir that may have whiteouts */ + if (S_ISDIR(realinode->i_mode)) { + struct ovl_entry *oe = dentry->d_fsdata; + + if (((upperdentry && lowerdentry) || oe->numlower > 1) || + ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { + ovl_set_flag(OVL_WHITEOUTS, inode); + } + } + if (inode->i_state & I_NEW) unlock_new_inode(inode); out: diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index a12dc10bf726..beb945e1963c 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -15,7 +15,6 @@ #include <linux/mount.h> #include <linux/exportfs.h> #include "overlayfs.h" -#include "ovl_entry.h" struct ovl_lookup_data { struct qstr name; @@ -286,16 +285,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, static int ovl_check_origin(struct dentry *upperdentry, - struct path *lowerstack, unsigned int numlower, - struct path **stackp, unsigned int *ctrp) + struct ovl_path *lower, unsigned int numlower, + struct ovl_path **stackp, unsigned int *ctrp) { struct vfsmount *mnt; struct dentry *origin = NULL; 
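[Editor's aside: the ovl_getattr() rework above can be observed from userspace. On an overlay whose layers live on different filesystems, a lower-layer file now reports a unique anonymous st_dev while directories keep the overlay's own st_dev, so find -xdev still traverses the whole mount. A small runnable check; both paths are assumptions, not from the patch:]

#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat dir_st, file_st;

	/* /mnt/ovl is assumed to be an overlay mountpoint and lower-file a
	 * file that has not been copied up. */
	if (stat("/mnt/ovl", &dir_st) || stat("/mnt/ovl/lower-file", &file_st)) {
		perror("stat");
		return 1;
	}
	printf("dir st_dev=%ju file st_dev=%ju\n",
	       (uintmax_t)dir_st.st_dev, (uintmax_t)file_st.st_dev);
	return 0;
}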
int i; - for (i = 0; i < numlower; i++) { - mnt = lowerstack[i].mnt; + mnt = lower[i].layer->mnt; origin = ovl_get_origin(upperdentry, mnt); if (IS_ERR(origin)) return PTR_ERR(origin); @@ -309,12 +307,12 @@ static int ovl_check_origin(struct dentry *upperdentry, BUG_ON(*ctrp); if (!*stackp) - *stackp = kmalloc(sizeof(struct path), GFP_KERNEL); + *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; } - **stackp = (struct path) { .dentry = origin, .mnt = mnt }; + **stackp = (struct ovl_path){.dentry = origin, .layer = lower[i].layer}; *ctrp = 1; return 0; @@ -350,8 +348,8 @@ static int ovl_verify_origin_fh(struct dentry *dentry, const struct ovl_fh *fh) * * Return 0 on match, -ESTALE on mismatch, < 0 on error. */ -int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt, - struct dentry *origin, bool is_upper, bool set) +int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, + bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; @@ -384,13 +382,13 @@ fail: * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. */ -int ovl_verify_index(struct dentry *index, struct path *lowerstack, +int ovl_verify_index(struct dentry *index, struct ovl_path *lower, unsigned int numlower) { struct ovl_fh *fh = NULL; size_t len; - struct path origin = { }; - struct path *stack = &origin; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; unsigned int ctr = 0; int err; @@ -429,7 +427,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, if (err) goto fail; - err = ovl_check_origin(index, lowerstack, numlower, &stack, &ctr); + err = ovl_check_origin(index, lower, numlower, &stack, &ctr); if (!err && !ctr) err = -ESTALE; if (err) @@ -437,7 +435,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, /* Check if index is orphan and don't warn before cleaning it */ if (d_inode(index)->i_nlink == 1 && - ovl_get_nlink(index, origin.dentry, 0) == 0) + ovl_get_nlink(origin.dentry, index, 0) == 0) err = -ENOENT; dput(origin.dentry); @@ -568,11 +566,24 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) idx++; } BUG_ON(idx > oe->numlower); - *path = oe->lowerstack[idx - 1]; + path->dentry = oe->lowerstack[idx - 1].dentry; + path->mnt = oe->lowerstack[idx - 1].layer->mnt; return (idx < oe->numlower) ? 
idx + 1 : -1; } +static int ovl_find_layer(struct ovl_fs *ofs, struct ovl_path *path) +{ + int i; + + for (i = 0; i < ofs->numlower; i++) { + if (ofs->lower_layers[i].mnt == path->layer->mnt) + break; + } + + return i; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -581,7 +592,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; - struct path *stack = NULL; + struct ovl_path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; struct dentry *index = NULL; unsigned int ctr = 0; @@ -630,7 +641,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = ovl_check_origin(upperdentry, roe->lowerstack, roe->numlower, &stack, &ctr); if (err) - goto out; + goto out_put_upper; } if (d.redirect) { @@ -646,17 +657,17 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; - stack = kcalloc(ofs->numlower, sizeof(struct path), + stack = kcalloc(ofs->numlower, sizeof(struct ovl_path), GFP_KERNEL); if (!stack) goto out_put_upper; } for (i = 0; !d.stop && i < poe->numlower; i++) { - struct path lowerpath = poe->lowerstack[i]; + struct ovl_path lower = poe->lowerstack[i]; d.last = i == poe->numlower - 1; - err = ovl_lookup_layer(lowerpath.dentry, &d, &this); + err = ovl_lookup_layer(lower.dentry, &d, &this); if (err) goto out_put; @@ -664,20 +675,34 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, continue; stack[ctr].dentry = this; - stack[ctr].mnt = lowerpath.mnt; + stack[ctr].layer = lower.layer; ctr++; if (d.stop) break; + /* + * Following redirects can have security consequences: it's like + * a symlink into the lower layer without the permission checks. + * This is only a problem if the upper layer is untrusted (e.g. + * comes from a USB drive). This can allow a non-readable file + * or directory to become readable. + * + * Following redirects only when they are explicitly enabled + * disables this attack vector when it is not needed.
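+ *
+ * Editor's note (not part of the patch): ovl_parse_redirect_mode()
+ * in super.c below maps the redirect_dir= mount option onto these
+ * two flags roughly as follows:
+ *   on       -> redirect_dir and redirect_follow
+ *   follow   -> redirect_follow only
+ *   off      -> redirect_follow only if redirect_always_follow is set
+ *   nofollow -> neither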
+ */ + err = -EPERM; + if (d.redirect && !ofs->config.redirect_follow) { + pr_warn_ratelimited("overlay: refusing to follow redirect for (%pd2)\n", dentry); + goto out_put; + } + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ - for (i = 0; i < poe->numlower; i++) - if (poe->lowerstack[i].mnt == lowerpath.mnt) - break; - if (WARN_ON(i == poe->numlower)) + i = ovl_find_layer(ofs, &lower); + if (WARN_ON(i == ofs->numlower)) break; } } @@ -700,7 +725,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; oe->opaque = upperopaque; - memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr); dentry->d_fsdata = oe; if (upperdentry) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d9a0edd4e57e..b489099ccd49 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> +#include "ovl_entry.h" enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), @@ -28,7 +29,10 @@ enum ovl_path_type { #define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink" enum ovl_flag { + /* Pure upper dir that may contain non pure upper entries */ OVL_IMPURE, + /* Non-merge dir that may contain whiteout entries */ + OVL_WHITEOUTS, OVL_INDEX, }; @@ -176,7 +180,7 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) { struct dentry *ret = vfs_tmpfile(dentry, mode, 0); - int err = IS_ERR(ret) ? PTR_ERR(ret) : 0; + int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return ret; @@ -223,6 +227,7 @@ bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry); void ovl_copy_up_end(struct dentry *dentry); +bool ovl_check_origin_xattr(struct dentry *dentry); bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, const char *name, const void *value, size_t size, @@ -244,9 +249,9 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) /* namei.c */ -int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt, - struct dentry *origin, bool is_upper, bool set); -int ovl_verify_index(struct dentry *index, struct path *lowerstack, +int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, + bool is_upper, bool set); +int ovl_verify_index(struct dentry *index, struct ovl_path *lower, unsigned int numlower); int ovl_get_index_name(struct dentry *origin, struct qstr *name); int ovl_path_next(int idx, struct dentry *dentry, struct path *path); @@ -263,7 +268,7 @@ int ovl_check_d_type_supported(struct path *realpath); void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct path *lowerstack, unsigned int numlower); + struct ovl_path *lower, unsigned int numlower); /* inode.c */ int ovl_set_nlink_upper(struct dentry *dentry); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 36b49bd09264..9d0bc03bf6e4 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -14,14 +14,26 @@ struct ovl_config { char *workdir; bool default_permissions; bool redirect_dir; + bool redirect_follow; + const char *redirect_mode; bool index; 
}; +struct ovl_layer { + struct vfsmount *mnt; + dev_t pseudo_dev; +}; + +struct ovl_path { + struct ovl_layer *layer; + struct dentry *dentry; +}; + /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; unsigned numlower; - struct vfsmount **lower_mnt; + struct ovl_layer *lower_layers; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -52,7 +64,7 @@ struct ovl_entry { struct rcu_head rcu; }; unsigned numlower; - struct path lowerstack[]; + struct ovl_path lowerstack[]; }; struct ovl_entry *ovl_alloc_entry(unsigned int numlower); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c310e3ff7f3f..8c98578d27a1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -26,6 +26,7 @@ struct ovl_cache_entry { struct list_head l_node; struct rb_node node; struct ovl_cache_entry *next_maybe_whiteout; + bool is_upper; bool is_whiteout; char name[]; }; @@ -158,6 +159,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, /* Defer setting d_ino for upper entry to ovl_iterate() */ if (ovl_calc_d_ino(rdd, p)) p->ino = 0; + p->is_upper = rdd->is_upper; p->is_whiteout = false; if (d_type == DT_CHR) { @@ -316,21 +318,37 @@ static inline int ovl_dir_read(struct path *realpath, return err; } +/* + * Can we iterate real dir directly? + * + * Non-merge dir may contain whiteouts from a time it was a merge upper, before + * lower dir was removed under it and possibly before it was rotated from upper + * to lower layer. + */ +static bool ovl_dir_is_real(struct dentry *dir) +{ + return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); +} + static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; struct dentry *dentry = file->f_path.dentry; - enum ovl_path_type type = ovl_path_type(dentry); + bool is_real; if (cache && ovl_dentry_version_get(dentry) != cache->version) { ovl_cache_put(od, dentry); od->cache = NULL; od->cursor = NULL; } - WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); - if (od->is_real && OVL_TYPE_MERGE(type)) + is_real = ovl_dir_is_real(dentry); + if (od->is_real != is_real) { + /* is_real can only become false when dir is copied up */ + if (WARN_ON(is_real)) + return; od->is_real = false; + } } static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, @@ -481,7 +499,7 @@ out: return err; fail: - pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } @@ -645,7 +663,10 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) return PTR_ERR(rdt.cache); } - return iterate_dir(od->realfile, &rdt.ctx); + err = iterate_dir(od->realfile, &rdt.ctx); + ctx->pos = rdt.ctx.pos; + + return err; } @@ -816,7 +837,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return PTR_ERR(realfile); } od->realfile = realfile; - od->is_real = !OVL_TYPE_MERGE(type); + od->is_real = ovl_dir_is_real(file->f_path.dentry); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; @@ -835,7 +856,7 @@ const struct file_operations ovl_dir_operations = { int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; - struct ovl_cache_entry *p; + struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; err = ovl_dir_read_merged(dentry, list, &root); @@ -844,18 +865,29 @@ 
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) err = 0; - list_for_each_entry(p, list, l_node) { - if (p->is_whiteout) - continue; + list_for_each_entry_safe(p, n, list, l_node) { + /* + * Select whiteouts in upperdir, they should + * be cleared when deleting this directory. + */ + if (p->is_whiteout) { + if (p->is_upper) + continue; + goto del_entry; + } if (p->name[0] == '.') { if (p->len == 1) - continue; + goto del_entry; if (p->len == 2 && p->name[1] == '.') - continue; + goto del_entry; } err = -ENOTEMPTY; break; + +del_entry: + list_del(&p->l_node); + kfree(p); } return err; @@ -869,7 +901,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) list_for_each_entry(p, list, l_node) { struct dentry *dentry; - if (!p->is_whiteout) + if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; dentry = lookup_one_len(p->name, upper, p->len); @@ -985,7 +1017,7 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, } int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct path *lowerstack, unsigned int numlower) + struct ovl_path *lower, unsigned int numlower) { int err; struct dentry *index = NULL; @@ -1020,7 +1052,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, index = NULL; break; } - err = ovl_verify_index(index, lowerstack, numlower); + err = ovl_verify_index(index, lower, numlower); /* Cleanup stale and orphan index entries */ if (err && (err == -ESTALE || err == -ENOENT)) err = ovl_cleanup(dir, index); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f5738e96a052..76440feb79f6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -18,7 +18,6 @@ #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include "overlayfs.h" -#include "ovl_entry.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); @@ -34,20 +33,32 @@ module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); MODULE_PARM_DESC(ovl_redirect_dir_def, "Default to on or off for the redirect_dir feature"); +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(ovl_redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(ovl_index_def, "Default to on or off for the inodes index feature"); +static void ovl_entry_stack_free(struct ovl_entry *oe) +{ + unsigned int i; + + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); +} + static void ovl_dentry_release(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; if (oe) { - unsigned int i; - - for (i = 0; i < oe->numlower; i++) - dput(oe->lowerstack[i].dentry); + ovl_entry_stack_free(oe); kfree_rcu(oe, rcu); } } @@ -207,46 +218,67 @@ static void ovl_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ovl_i_callback); } -static void ovl_put_super(struct super_block *sb) +static void ovl_free_fs(struct ovl_fs *ofs) { - struct ovl_fs *ufs = sb->s_fs_info; unsigned i; - dput(ufs->indexdir); - dput(ufs->workdir); - if (ufs->workdir_locked) - ovl_inuse_unlock(ufs->workbasedir); - dput(ufs->workbasedir); - if (ufs->upper_mnt && ufs->upperdir_locked) - ovl_inuse_unlock(ufs->upper_mnt->mnt_root); - mntput(ufs->upper_mnt); - for (i = 0; i < ufs->numlower; 
i++) - mntput(ufs->lower_mnt[i]); - kfree(ufs->lower_mnt); - - kfree(ufs->config.lowerdir); - kfree(ufs->config.upperdir); - kfree(ufs->config.workdir); - put_cred(ufs->creator_cred); - kfree(ufs); + dput(ofs->indexdir); + dput(ofs->workdir); + if (ofs->workdir_locked) + ovl_inuse_unlock(ofs->workbasedir); + dput(ofs->workbasedir); + if (ofs->upperdir_locked) + ovl_inuse_unlock(ofs->upper_mnt->mnt_root); + mntput(ofs->upper_mnt); + for (i = 0; i < ofs->numlower; i++) { + mntput(ofs->lower_layers[i].mnt); + free_anon_bdev(ofs->lower_layers[i].pseudo_dev); + } + kfree(ofs->lower_layers); + + kfree(ofs->config.lowerdir); + kfree(ofs->config.upperdir); + kfree(ofs->config.workdir); + kfree(ofs->config.redirect_mode); + if (ofs->creator_cred) + put_cred(ofs->creator_cred); + kfree(ofs); } +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + ovl_free_fs(ofs); +} + +/* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; struct super_block *upper_sb; int ret; - if (!ufs->upper_mnt) + if (!ofs->upper_mnt) return 0; - upper_sb = ufs->upper_mnt->mnt_sb; - if (!upper_sb->s_op->sync_fs) + + /* + * If this is a sync(2) call or an emergency sync, all the super blocks + * will be iterated, including upper_sb, so no need to do anything. + * + * If this is a syncfs(2) call, then we do need to call + * sync_filesystem() on upper_sb, but it is enough to do it when + * called with wait == 1. + */ + if (!wait) return 0; - /* real inodes have already been synced by sync_filesystem(ovl_sb) */ + upper_sb = ofs->upper_mnt->mnt_sb; + down_read(&upper_sb->s_umount); - ret = upper_sb->s_op->sync_fs(upper_sb, wait); + ret = sync_filesystem(upper_sb); up_read(&upper_sb->s_umount); + return ret; } @@ -277,9 +309,14 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) } /* Will this overlay be forced to mount/remount ro? */ -static bool ovl_force_readonly(struct ovl_fs *ufs) +static bool ovl_force_readonly(struct ovl_fs *ofs) +{ + return (!ofs->upper_mnt || !ofs->workdir); +} + +static const char *ovl_redirect_mode_def(void) { - return (!ufs->upper_mnt || !ufs->workdir); + return ovl_redirect_dir_def ? "on" : "off"; } /** @@ -291,29 +328,27 @@ static bool ovl_force_readonly(struct ovl_fs *ufs) static int ovl_show_options(struct seq_file *m, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; - seq_show_option(m, "lowerdir", ufs->config.lowerdir); - if (ufs->config.upperdir) { - seq_show_option(m, "upperdir", ufs->config.upperdir); - seq_show_option(m, "workdir", ufs->config.workdir); + seq_show_option(m, "lowerdir", ofs->config.lowerdir); + if (ofs->config.upperdir) { + seq_show_option(m, "upperdir", ofs->config.upperdir); + seq_show_option(m, "workdir", ofs->config.workdir); } - if (ufs->config.default_permissions) + if (ofs->config.default_permissions) seq_puts(m, ",default_permissions"); - if (ufs->config.redirect_dir != ovl_redirect_dir_def) - seq_printf(m, ",redirect_dir=%s", - ufs->config.redirect_dir ? "on" : "off"); - if (ufs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", - ufs->config.index ?
"on" : "off"); + if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) + seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); + if (ofs->config.index != ovl_index_def) + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); return 0; } static int ovl_remount(struct super_block *sb, int *flags, char *data) { - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && ovl_force_readonly(ufs)) + if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; return 0; @@ -335,8 +370,7 @@ enum { OPT_UPPERDIR, OPT_WORKDIR, OPT_DEFAULT_PERMISSIONS, - OPT_REDIRECT_DIR_ON, - OPT_REDIRECT_DIR_OFF, + OPT_REDIRECT_DIR, OPT_INDEX_ON, OPT_INDEX_OFF, OPT_ERR, @@ -347,8 +381,7 @@ static const match_table_t ovl_tokens = { {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_REDIRECT_DIR_ON, "redirect_dir=on"}, - {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"}, + {OPT_REDIRECT_DIR, "redirect_dir=%s"}, {OPT_INDEX_ON, "index=on"}, {OPT_INDEX_OFF, "index=off"}, {OPT_ERR, NULL} @@ -377,10 +410,37 @@ static char *ovl_next_opt(char **s) return sbegin; } +static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) +{ + if (strcmp(mode, "on") == 0) { + config->redirect_dir = true; + /* + * Does not make sense to have redirect creation without + * redirect following. + */ + config->redirect_follow = true; + } else if (strcmp(mode, "follow") == 0) { + config->redirect_follow = true; + } else if (strcmp(mode, "off") == 0) { + if (ovl_redirect_always_follow) + config->redirect_follow = true; + } else if (strcmp(mode, "nofollow") != 0) { + pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + mode); + return -EINVAL; + } + + return 0; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; + config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); + if (!config->redirect_mode) + return -ENOMEM; + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -415,12 +475,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->default_permissions = true; break; - case OPT_REDIRECT_DIR_ON: - config->redirect_dir = true; - break; - - case OPT_REDIRECT_DIR_OFF: - config->redirect_dir = false; + case OPT_REDIRECT_DIR: + kfree(config->redirect_mode); + config->redirect_mode = match_strdup(&args[0]); + if (!config->redirect_mode) + return -ENOMEM; break; case OPT_INDEX_ON: @@ -445,19 +504,17 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->workdir = NULL; } - return 0; + return ovl_parse_redirect_mode(config, config->redirect_mode); } #define OVL_WORKDIR_NAME "work" #define OVL_INDEXDIR_NAME "index" -static struct dentry *ovl_workdir_create(struct super_block *sb, - struct ovl_fs *ufs, - struct dentry *dentry, +static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, const char *name, bool persist) { - struct inode *dir = dentry->d_inode; - struct vfsmount *mnt = ufs->upper_mnt; + struct inode *dir = ofs->workbasedir->d_inode; + struct vfsmount *mnt = ofs->upper_mnt; struct dentry *work; int err; bool retried = false; @@ -471,7 +528,7 @@ static struct dentry *ovl_workdir_create(struct super_block *sb, locked = true; retry: - work = lookup_one_len(name, dentry, strlen(name)); + work = lookup_one_len(name, ofs->workbasedir, strlen(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -541,8 +598,7 @@ out_dput: dput(work); out_err: pr_warn("overlayfs: 
failed to create directory %s/%s (errno: %i); mounting read-only\n", - ufs->config.workdir, name, -err); - sb->s_flags |= MS_RDONLY; + ofs->config.workdir, name, -err); work = NULL; goto out_unlock; } @@ -585,7 +641,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) return 0; out_put: - path_put(path); + path_put_init(path); out: return err; } @@ -603,7 +659,7 @@ static int ovl_mount_dir(const char *name, struct path *path) if (ovl_dentry_remote(path->dentry)) { pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", tmp); - path_put(path); + path_put_init(path); err = -EINVAL; } kfree(tmp); @@ -655,7 +711,7 @@ static int ovl_lower_dir(const char *name, struct path *path, return 0; out_put: - path_put(path); + path_put_init(path); out: return err; } @@ -826,129 +882,269 @@ static const struct xattr_handler *ovl_xattr_handlers[] = { NULL }; -static int ovl_fill_super(struct super_block *sb, void *data, int silent) +static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) { - struct path upperpath = { }; - struct path workpath = { }; - struct dentry *root_dentry; - struct ovl_entry *oe; - struct ovl_fs *ufs; - struct path *stack = NULL; - char *lowertmp; - char *lower; - unsigned int numlower; - unsigned int stacklen = 0; - unsigned int i; - bool remote = false; - struct cred *cred; + struct vfsmount *upper_mnt; int err; - err = -ENOMEM; - ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); - if (!ufs) + err = ovl_mount_dir(ofs->config.upperdir, upperpath); + if (err) goto out; - ufs->config.redirect_dir = ovl_redirect_dir_def; - ufs->config.index = ovl_index_def; - err = ovl_parse_opt((char *) data, &ufs->config); + /* Upper fs should not be r/o */ + if (sb_rdonly(upperpath->mnt->mnt_sb)) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out; + } + + err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir); if (err) - goto out_free_config; + goto out; + + err = -EBUSY; + if (ovl_inuse_trylock(upperpath->dentry)) { + ofs->upperdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } + + upper_mnt = clone_private_mount(upperpath); + err = PTR_ERR(upper_mnt); + if (IS_ERR(upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out; + } + + /* Don't inherit atime flags */ + upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + ofs->upper_mnt = upper_mnt; + err = 0; +out: + return err; +} + +static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) +{ + struct dentry *temp; + int err; + + ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); + if (!ofs->workdir) + return 0; + + /* + * Upper should support d_type, else whiteouts are visible. Given + * workdir and upper are on same fs, we can do iterate_dir() on + * workdir. This check requires successful creation of workdir in + * previous step. + */ + err = ovl_check_d_type_supported(workpath); + if (err < 0) + return err; + + /* + * We allowed this configuration and don't want to break users over + * kernel upgrade. So warn instead of erroring out. 
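+ *
+ * Editor's note (not part of the patch): a common example of a missing
+ * d_type is xfs formatted with ftype=0; the overlay still mounts on
+ * such an upper fs, but whiteouts become visible in readdir.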
+ */ + if (!err) + pr_warn("overlayfs: upper fs needs to support d_type.\n"); + + /* Check if upper/work fs supports O_TMPFILE */ + temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(temp); + if (ofs->tmpfile) + dput(temp); + else + pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + + /* + * Check if upper/work fs supports trusted.overlay.* xattr + */ + err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0); + if (err) { + ofs->noxattr = true; + pr_warn("overlayfs: upper fs does not support xattr.\n"); + } else { + vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); + } + + /* Check if upper/work fs supports file handles */ + if (ofs->config.index && + !ovl_can_decode_fh(ofs->workdir->d_sb)) { + ofs->config.index = false; + pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); + } + + return 0; +} + +static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) +{ + int err; + struct path workpath = { }; + + err = ovl_mount_dir(ofs->config.workdir, &workpath); + if (err) + goto out; err = -EINVAL; - if (!ufs->config.lowerdir) { - if (!silent) - pr_err("overlayfs: missing 'lowerdir'\n"); - goto out_free_config; + if (upperpath->mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out; } - sb->s_stack_depth = 0; - sb->s_maxbytes = MAX_LFS_FILESIZE; - if (ufs->config.upperdir) { - if (!ufs->config.workdir) { - pr_err("overlayfs: missing 'workdir'\n"); - goto out_free_config; - } + err = -EBUSY; + if (ovl_inuse_trylock(workpath.dentry)) { + ofs->workdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } - err = ovl_mount_dir(ufs->config.upperdir, &upperpath); - if (err) - goto out_free_config; + ofs->workbasedir = dget(workpath.dentry); + err = ovl_make_workdir(ofs, &workpath); + if (err) + goto out; - /* Upper fs should not be r/o */ - if (sb_rdonly(upperpath.mnt->mnt_sb)) { - pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); - err = -EINVAL; - goto out_put_upperpath; - } + err = 0; +out: + path_put(&workpath); - err = ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir); - if (err) - goto out_put_upperpath; - - err = -EBUSY; - if (ovl_inuse_trylock(upperpath.dentry)) { - ufs->upperdir_locked = true; - } else if (ufs->config.index) { - pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); - goto out_put_upperpath; - } else { - pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); - } + return err; +} + +static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe, + struct path *upperpath) +{ + int err; + + /* Verify lower root is upper root origin */ + err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, + false, true); + if (err) { + pr_err("overlayfs: failed to verify upper root origin\n"); + goto out; + } - err = ovl_mount_dir(ufs->config.workdir, &workpath); + ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, 
true); + if (ofs->indexdir) { + /* Verify upper root is index dir origin */ + err = ovl_verify_origin(ofs->indexdir, upperpath->dentry, + true, true); if (err) - goto out_unlock_upperdentry; + pr_err("overlayfs: failed to verify index dir origin\n"); - err = -EINVAL; - if (upperpath.mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_workpath; - } - if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); - goto out_put_workpath; + /* Cleanup bad/stale/orphan index entries */ + if (!err) + err = ovl_indexdir_cleanup(ofs->indexdir, + ofs->upper_mnt, + oe->lowerstack, + oe->numlower); + } + if (err || !ofs->indexdir) + pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + +out: + return err; +} + +static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, + unsigned int numlower) +{ + int err; + unsigned int i; + + err = -ENOMEM; + ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer), + GFP_KERNEL); + if (ofs->lower_layers == NULL) + goto out; + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt; + dev_t dev; + + err = get_anon_bdev(&dev); + if (err) { + pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + goto out; } - err = -EBUSY; - if (ovl_inuse_trylock(workpath.dentry)) { - ufs->workdir_locked = true; - } else if (ufs->config.index) { - pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); - goto out_put_workpath; - } else { - pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + mnt = clone_private_mount(&stack[i]); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + free_anon_bdev(dev); + goto out; } + /* + * Make lower layers R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. 
+ */ + mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; - ufs->workbasedir = workpath.dentry; - sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; + ofs->lower_layers[ofs->numlower].mnt = mnt; + ofs->lower_layers[ofs->numlower].pseudo_dev = dev; + ofs->numlower++; + + /* Check if all lower layers are on same sb */ + if (i == 0) + ofs->same_sb = mnt->mnt_sb; + else if (ofs->same_sb != mnt->mnt_sb) + ofs->same_sb = NULL; } + err = 0; +out: + return err; +} + +static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, + struct ovl_fs *ofs) +{ + int err; + char *lowertmp, *lower; + struct path *stack = NULL; + unsigned int stacklen, numlower = 0, i; + bool remote = false; + struct ovl_entry *oe; + err = -ENOMEM; - lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL); + lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL); if (!lowertmp) - goto out_unlock_workdentry; + goto out_err; err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { pr_err("overlayfs: too many lower directories, limit is %d\n", OVL_MAX_STACK); - goto out_free_lowertmp; - } else if (!ufs->config.upperdir && stacklen == 1) { + goto out_err; + } else if (!ofs->config.upperdir && stacklen == 1) { pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); - goto out_free_lowertmp; + goto out_err; } err = -ENOMEM; stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) - goto out_free_lowertmp; + goto out_err; err = -EINVAL; lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { - err = ovl_lower_dir(lower, &stack[numlower], ufs, + err = ovl_lower_dir(lower, &stack[numlower], ofs, &sb->s_stack_depth, &remote); if (err) - goto out_put_lowerpath; + goto out_err; lower = strchr(lower, '\0') + 1; } @@ -957,190 +1153,143 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("overlayfs: maximum fs stacking depth exceeded\n"); - goto out_put_lowerpath; + goto out_err; } - if (ufs->config.upperdir) { - ufs->upper_mnt = clone_private_mount(&upperpath); - err = PTR_ERR(ufs->upper_mnt); - if (IS_ERR(ufs->upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); - goto out_put_lowerpath; - } + err = ovl_get_lower_layers(ofs, stack, numlower); + if (err) + goto out_err; - /* Don't inherit atime flags */ - ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + err = -ENOMEM; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_err; - sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran; + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = dget(stack[i].dentry); + oe->lowerstack[i].layer = &ofs->lower_layers[i]; + } - ufs->workdir = ovl_workdir_create(sb, ufs, workpath.dentry, - OVL_WORKDIR_NAME, false); - /* - * Upper should support d_type, else whiteouts are visible. - * Given workdir and upper are on same fs, we can do - * iterate_dir() on workdir. This check requires successful - * creation of workdir in previous step. - */ - if (ufs->workdir) { - struct dentry *temp; - - err = ovl_check_d_type_supported(&workpath); - if (err < 0) - goto out_put_workdir; - - /* - * We allowed this configuration and don't want to - * break users over kernel upgrade. So warn instead - * of erroring out. 
- */ - if (!err) - pr_warn("overlayfs: upper fs needs to support d_type.\n"); - - /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ufs->workdir, S_IFREG | 0); - ufs->tmpfile = !IS_ERR(temp); - if (ufs->tmpfile) - dput(temp); - else - pr_warn("overlayfs: upper fs does not support tmpfile.\n"); - - /* - * Check if upper/work fs supports trusted.overlay.* - * xattr - */ - err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE, - "0", 1, 0); - if (err) { - ufs->noxattr = true; - pr_warn("overlayfs: upper fs does not support xattr.\n"); - } else { - vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE); - } + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; - /* Check if upper/work fs supports file handles */ - if (ufs->config.index && - !ovl_can_decode_fh(ufs->workdir->d_sb)) { - ufs->config.index = false; - pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); - } - } - } +out: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); + kfree(lowertmp); + + return oe; + +out_err: + oe = ERR_PTR(err); + goto out; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path upperpath = { }; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ofs; + struct cred *cred; + int err; err = -ENOMEM; - ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); - if (ufs->lower_mnt == NULL) - goto out_put_workdir; - for (i = 0; i < numlower; i++) { - struct vfsmount *mnt = clone_private_mount(&stack[i]); + ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ofs) + goto out; - err = PTR_ERR(mnt); - if (IS_ERR(mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); - goto out_put_lower_mnt; - } - /* - * Make lower_mnt R/O. That way fchmod/fchown on lower file - * will fail instead of modifying lower fs. 
- */ - mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; + ofs->creator_cred = cred = prepare_creds(); + if (!cred) + goto out_err; - ufs->lower_mnt[ufs->numlower] = mnt; - ufs->numlower++; + ofs->config.index = ovl_index_def; + err = ovl_parse_opt((char *) data, &ofs->config); + if (err) + goto out_err; - /* Check if all lower layers are on same sb */ - if (i == 0) - ufs->same_sb = mnt->mnt_sb; - else if (ufs->same_sb != mnt->mnt_sb) - ufs->same_sb = NULL; + err = -EINVAL; + if (!ofs->config.lowerdir) { + if (!silent) + pr_err("overlayfs: missing 'lowerdir'\n"); + goto out_err; } - /* If the upper fs is nonexistent, we mark overlayfs r/o too */ - if (!ufs->upper_mnt) - sb->s_flags |= MS_RDONLY; - else if (ufs->upper_mnt->mnt_sb != ufs->same_sb) - ufs->same_sb = NULL; - - if (!(ovl_force_readonly(ufs)) && ufs->config.index) { - /* Verify lower root is upper root origin */ - err = ovl_verify_origin(upperpath.dentry, ufs->lower_mnt[0], - stack[0].dentry, false, true); - if (err) { - pr_err("overlayfs: failed to verify upper root origin\n"); - goto out_put_lower_mnt; + sb->s_stack_depth = 0; + sb->s_maxbytes = MAX_LFS_FILESIZE; + if (ofs->config.upperdir) { + if (!ofs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_err; } - ufs->indexdir = ovl_workdir_create(sb, ufs, workpath.dentry, - OVL_INDEXDIR_NAME, true); - if (ufs->indexdir) { - /* Verify upper root is index dir origin */ - err = ovl_verify_origin(ufs->indexdir, ufs->upper_mnt, - upperpath.dentry, true, true); - if (err) - pr_err("overlayfs: failed to verify index dir origin\n"); + err = ovl_get_upper(ofs, &upperpath); + if (err) + goto out_err; - /* Cleanup bad/stale/orphan index entries */ - if (!err) - err = ovl_indexdir_cleanup(ufs->indexdir, - ufs->upper_mnt, - stack, numlower); - } - if (err || !ufs->indexdir) - pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + err = ovl_get_workdir(ofs, &upperpath); if (err) - goto out_put_indexdir; + goto out_err; + + if (!ofs->workdir) + sb->s_flags |= SB_RDONLY; + + sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth; + sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran; + } + oe = ovl_get_lowerstack(sb, ofs); + err = PTR_ERR(oe); + if (IS_ERR(oe)) + goto out_err; - /* Show index=off/on in /proc/mounts for any of the reasons above */ - if (!ufs->indexdir) - ufs->config.index = false; + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ofs->upper_mnt) + sb->s_flags |= SB_RDONLY; + else if (ofs->upper_mnt->mnt_sb != ofs->same_sb) + ofs->same_sb = NULL; - if (remote) - sb->s_d_op = &ovl_reval_dentry_operations; - else - sb->s_d_op = &ovl_dentry_operations; + if (!(ovl_force_readonly(ofs)) && ofs->config.index) { + err = ovl_get_indexdir(ofs, oe, &upperpath); + if (err) + goto out_free_oe; - err = -ENOMEM; - ufs->creator_cred = cred = prepare_creds(); - if (!cred) - goto out_put_indexdir; + if (!ofs->indexdir) + sb->s_flags |= SB_RDONLY; + } + + /* Show index=off/on in /proc/mounts for any of the reasons above */ + if (!ofs->indexdir) + ofs->config.index = false; /* Never override disk quota limits or use reserved space */ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); - err = -ENOMEM; - oe = ovl_alloc_entry(numlower); - if (!oe) - goto out_put_cred; - sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; sb->s_xattr = ovl_xattr_handlers; - sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; + sb->s_fs_info = ofs; + sb->s_flags |= SB_POSIXACL | 
SB_NOREMOTELOCK; + err = -ENOMEM; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); if (!root_dentry) goto out_free_oe; mntput(upperpath.mnt); - for (i = 0; i < numlower; i++) - mntput(stack[i].mnt); - mntput(workpath.mnt); - kfree(lowertmp); - if (upperpath.dentry) { oe->has_upper = true; if (ovl_is_impuredir(upperpath.dentry)) ovl_set_flag(OVL_IMPURE, d_inode(root_dentry)); } - for (i = 0; i < numlower; i++) { - oe->lowerstack[i].dentry = stack[i].dentry; - oe->lowerstack[i].mnt = ufs->lower_mnt[i]; - } - kfree(stack); root_dentry->d_fsdata = oe; + /* Root is always merge -> can have whiteouts */ + ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, ovl_dentry_lower(root_dentry)); @@ -1149,39 +1298,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) return 0; out_free_oe: + ovl_entry_stack_free(oe); kfree(oe); -out_put_cred: - put_cred(ufs->creator_cred); -out_put_indexdir: - dput(ufs->indexdir); -out_put_lower_mnt: - for (i = 0; i < ufs->numlower; i++) - mntput(ufs->lower_mnt[i]); - kfree(ufs->lower_mnt); -out_put_workdir: - dput(ufs->workdir); - mntput(ufs->upper_mnt); -out_put_lowerpath: - for (i = 0; i < numlower; i++) - path_put(&stack[i]); - kfree(stack); -out_free_lowertmp: - kfree(lowertmp); -out_unlock_workdentry: - if (ufs->workdir_locked) - ovl_inuse_unlock(workpath.dentry); -out_put_workpath: - path_put(&workpath); -out_unlock_upperdentry: - if (ufs->upperdir_locked) - ovl_inuse_unlock(upperpath.dentry); -out_put_upperpath: +out_err: path_put(&upperpath); -out_free_config: - kfree(ufs->config.lowerdir); - kfree(ufs->config.upperdir); - kfree(ufs->config.workdir); - kfree(ufs); + ovl_free_fs(ofs); out: return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index b9b239fa5cfd..d6bb1c9f5e7a 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -17,7 +17,6 @@ #include <linux/namei.h> #include <linux/ratelimit.h> #include "overlayfs.h" -#include "ovl_entry.h" int ovl_want_write(struct dentry *dentry) { @@ -125,7 +124,12 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = dentry->d_fsdata; - *path = oe->numlower ? oe->lowerstack[0] : (struct path) { }; + if (oe->numlower) { + path->mnt = oe->lowerstack[0].layer->mnt; + path->dentry = oe->lowerstack[0].dentry; + } else { + *path = (struct path) { }; + } } enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) @@ -329,6 +333,19 @@ void ovl_copy_up_end(struct dentry *dentry) mutex_unlock(&OVL_I(d_inode(dentry))->lock); } +bool ovl_check_origin_xattr(struct dentry *dentry) +{ + int res; + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + + /* Zero size value means "copied up but origin unknown" */ + if (res >= 0) + return true; + + return false; +} + bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) { int res; diff --git a/fs/pipe.c b/fs/pipe.c index 349c9d56d4b3..6d98566201ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fops = { /* * Currently we rely on the pipe array holding a power-of-2 number - * of pages. + * of pages. Returns 0 on error. 
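+ * Editor's note (not part of the patch), a worked example: with 4 KiB
+ * pages, round_pipe_size(100000) computes nr_pages = 25 and returns
+ * roundup_pow_of_two(25) << PAGE_SHIFT = 32 << 12 = 131072 bytes.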
*/ -static inline unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; + if (size < pipe_min_size) + size = pipe_min_size; + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages == 0) + return 0; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } @@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); + if (size == 0) + return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1117,20 +1125,13 @@ out_revert_acct: } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size * will return an error. */ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { - int ret; - - ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); - if (ret < 0 || !write) - return ret; - - pipe_max_size = round_pipe_size(pipe_max_size); - return ret; + return proc_dopipe_max_size(table, write, buf, lenp, ppos); } /* diff --git a/fs/proc/Makefile b/fs/proc/Makefile index f7456c4e7d0f..ead487e80510 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += loadavg.o proc-y += meminfo.o proc-y += stat.o proc-y += uptime.o +proc-y += util.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 6f6fc1672ad1..79375fc115d2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -366,6 +366,11 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); } +static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -376,6 +381,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); + task_core_dumping(m, mm); mmput(mm); } task_sig(m, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9d357b2ea6cb..60316b52d659 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -443,8 +443,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pB\n", - (void *)entries[i], (void *)entries[i]); + seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } @@ -1682,7 +1681,7 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -void task_dump_owner(struct task_struct *task, mode_t mode, +void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a @@ -2269,7 +2268,7 @@ static int show_timer(struct seq_file *m, void *v) notify = timer->it_sigev_notify; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%p\n", + seq_printf(m, "signal: %d/%px\n", timer->sigq->info.si_signo, timer->sigq->info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index e0f867cd8553..96f1087e372c 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cpufreq.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include 
<linux/seq_file.h> +__weak void arch_freq_prepare_all(void) +{ +} + extern const struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { + arch_freq_prepare_all(); return seq_open(file, &cpuinfo_op); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 225f541f7078..dd0f82622427 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -483,7 +483,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent) /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a34195e92b20..4a67188c8d74 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -100,31 +100,10 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -void task_dump_owner(struct task_struct *task, mode_t mode, +void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); -static inline unsigned name_to_int(const struct qstr *qstr) -{ - const char *name = qstr->name; - int len = qstr->len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - +unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bc5c58c00ee..a000d7547479 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 4e42aba97f2e..ede8e64974be 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -91,7 +91,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, { struct pid_namespace *ns; - if (flags & MS_KERNMOUNT) { + if (flags & SB_KERNMOUNT) { ns = data; data = NULL; } else { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6744bd706ecf..339e4c1c044d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -50,8 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -67,7 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << 
(PAGE_SHIFT-10), @@ -80,8 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, + mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } @@ -665,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 000000000000..b161cfa0f9fa --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,23 @@ +#include <linux/dcache.h> + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + do { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } while (--len > 0); + return n; +out: + return ~0U; +} diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7b635d173213..b786840facd9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -45,10 +45,10 @@ struct proc_fs_info { static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { MS_LAZYTIME, ",lazytime" }, + { SB_SYNCHRONOUS, ",sync" }, + { SB_DIRSYNC, ",dirsync" }, + { SB_MANDLOCK, ",mand" }, + { SB_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 086e491faf04..691032107f8c 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -61,7 +61,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " static int pstore_new_entry; -static void pstore_timefunc(unsigned long); +static void pstore_timefunc(struct timer_list *); static DEFINE_TIMER(pstore_timer, pstore_timefunc); static void pstore_dowork(struct work_struct *); @@ -651,7 +651,7 @@ static int pstore_write_user_compat(struct pstore_record *record, return -EINVAL; record->buf = memdup_user(buf, record->size); - if (unlikely(IS_ERR(record->buf))) { + if (IS_ERR(record->buf)) { ret = PTR_ERR(record->buf); goto out; } @@ -890,7 +890,7 @@ static void pstore_dowork(struct work_struct *work) pstore_get_records(1); } -static void pstore_timefunc(unsigned long dummy) +static void pstore_timefunc(struct timer_list *unused) { if (pstore_new_entry) { pstore_new_entry = 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3a67cfb142d8..3d46fe302fcb 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -47,7 +47,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -199,7 +199,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* Check the superblock signature. 
Since the qnx4 code is dangerous, we should leave as quickly as possible diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 1192422a1c56..4aeb26bcb4d0 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -56,7 +56,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -427,7 +427,7 @@ mmi_success: } s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* ease the later tree level calculations */ sbi = QNX6_SB(s); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 39f1b0b0c76f..020c597ef9b6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -941,12 +941,13 @@ static int dqinit_needed(struct inode *inode, int type) } /* This routine is guarded by s_umount semaphore */ -static void add_dquot_ref(struct super_block *sb, int type) +static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif + int err = 0; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { @@ -966,7 +967,11 @@ static void add_dquot_ref(struct super_block *sb, int type) reserved = 1; #endif iput(old_inode); - __dquot_initialize(inode, type); + err = __dquot_initialize(inode, type); + if (err) { + iput(inode); + goto out; + } /* * We hold a reference to 'inode' so it couldn't have been @@ -981,7 +986,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); - +out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " @@ -989,6 +994,7 @@ static void add_dquot_ref(struct super_block *sb, int type) "Please run quotacheck(8)"); } #endif + return err; } /* @@ -2379,10 +2385,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); - add_dquot_ref(sb, type); - - return 0; + error = add_dquot_ref(sb, type); + if (error) + dquot_disable(sb, type, flags); + return error; out_file_init: dqopt->files[type] = NULL; iput(inode); @@ -2985,7 +2992,8 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - register_shrinker(&dqcache_shrinker); + if (register_shrinker(&dqcache_shrinker)) + panic("Cannot register dquot shrinker"); return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index 0046d72efe94..f8547b82dfb3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -635,27 +635,6 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ret; } -/* - * Reduce an iovec's length in-place. 
Return the resulting number of segments - */ -unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) -{ - unsigned long seg = 0; - size_t len = 0; - - while (seg < nr_segs) { - seg++; - if (len + iov->iov_len >= to) { - iov->iov_len = to - len; - break; - } - len += iov->iov_len; - iov++; - } - return seg; -} -EXPORT_SYMBOL(iov_shorten); - static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 11a48affa882..b13fc024d2ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2106,7 +2106,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, journal_end(th); goto out_inserted_sd; } - } else if (inode->i_sb->s_flags & MS_POSIXACL) { + } else if (inode->i_sb->s_flags & SB_POSIXACL) { reiserfs_warning(inode->i_sb, "jdm-13090", "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 69ff280bdfe8..70057359fbaf 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1960,7 +1960,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, /* * Cancel flushing of old commits. Note that neither of these works * will be requeued because superblock is being shutdown and doesn't - * have MS_ACTIVE set. + * have SB_ACTIVE set. */ reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ @@ -4302,7 +4302,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) * Avoid queueing work when sb is being shut down. Transaction * will be flushed on journal shutdown. */ - if (sb->s_flags & MS_ACTIVE) + if (sb->s_flags & SB_ACTIVE) queue_delayed_work(REISERFS_SB(sb)->commit_wq, &journal->j_work, HZ / 10); } @@ -4393,7 +4393,7 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) if (!journal->j_errno) journal->j_errno = errno; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(J_ABORTED, &journal->j_state); #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 64f49cafbc5b..7e288d97adcb 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -390,7 +390,7 @@ void __reiserfs_error(struct super_block *sb, const char *id, return; reiserfs_info(sb, "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, -EIO); } @@ -409,7 +409,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, error_buf); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, errno); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5464ec517702..1fc934d24459 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -121,7 +121,7 @@ void reiserfs_schedule_old_flush(struct super_block *s) * Avoid scheduling flush when sb is being shut down. It can race * with journal shutdown and free still queued delayed work. 
*/ - if (sb_rdonly(s) || !(s->s_flags & MS_ACTIVE)) + if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE)) return; spin_lock(&sbi->old_work_lock); @@ -252,11 +252,11 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - if (s->s_flags & MS_ACTIVE) { + if (s->s_flags & SB_ACTIVE) { ms_active_set = 0; } else { ms_active_set = 1; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } /* Turn on quotas so that they are updated correctly */ for (i = 0; i < REISERFS_MAXQUOTAS; i++) { @@ -411,7 +411,7 @@ static int finish_unfinished(struct super_block *s) reiserfs_write_lock(s); if (ms_active_set) /* Restore the flag back */ - s->s_flags &= ~MS_ACTIVE; + s->s_flags &= ~SB_ACTIVE; #endif pathrelse(&path); if (done) @@ -1521,7 +1521,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; } - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); /* remount read-only */ @@ -1567,7 +1567,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); /* now it is safe to call journal_begin */ - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; err = journal_begin(&th, s, 10); if (err) goto out_err_unlock; @@ -1575,7 +1575,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; set_sb_umount_state(rs, REISERFS_ERROR_FS); if (!old_format_only(s)) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); @@ -1590,7 +1590,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; reiserfs_write_unlock(s); - if (!(*mount_flags & MS_RDONLY)) { + if (!(*mount_flags & SB_RDONLY)) { dquot_resume(s, -1); reiserfs_write_lock(s); finish_unfinished(s); @@ -2055,7 +2055,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { SWARN(silent, s, "clm-7000", "Detected readonly device, marking FS readonly"); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } args.objectid = REISERFS_ROOT_OBJECTID; args.dirid = REISERFS_ROOT_PARENT_OBJECTID; @@ -2591,7 +2591,6 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 46492fb37a4c..5dbf5324bdda 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -959,7 +959,7 @@ int reiserfs_lookup_privroot(struct super_block *s) /* * We need to take a copy of the mount flags since things like - * MS_RDONLY don't get set until *after* we're called. + * SB_RDONLY don't get set until *after* we're called. 
* mount_flags != mount_options */ int reiserfs_xattr_init(struct super_block *s, int mount_flags) @@ -971,7 +971,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (err) goto error; - if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) { inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); inode_unlock(d_inode(s->s_root)); @@ -999,11 +999,11 @@ error: clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); } - /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */ if (reiserfs_posixacl(s)) - s->s_flags |= MS_POSIXACL; + s->s_flags |= SB_POSIXACL; else - s->s_flags &= ~MS_POSIXACL; + s->s_flags &= ~SB_POSIXACL; return err; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 0186fe6d39f3..8f06fd1f3d69 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -451,7 +451,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int romfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -502,7 +502,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = 0xFFFFFFFF; sb->s_magic = ROMFS_MAGIC; - sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_op = &romfs_super_ops; #ifdef CONFIG_ROMFS_ON_MTD diff --git a/fs/select.c b/fs/select.c index 063067e606ca..6de493bb42a4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -292,8 +292,7 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { - struct timespec64 rts64; - struct timespec rts; + struct timespec64 rts; struct timeval rtv; if (!p) @@ -306,23 +305,22 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts64(&rts64); - rts64 = timespec64_sub(*end_time, rts64); - if (rts64.tv_sec < 0) - rts64.tv_sec = rts64.tv_nsec = 0; + ktime_get_ts64(&rts); + rts = timespec64_sub(*end_time, rts); + if (rts.tv_sec < 0) + rts.tv_sec = rts.tv_nsec = 0; - rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts64.tv_sec; - rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; - } else if (!copy_to_user(p, &rts, sizeof(rts))) + } else if (!put_timespec64(&rts, p)) return ret; /* @@ -705,17 +703,15 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts; - struct timespec64 ts64, end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (get_timespec64(&ts, tsp)) return -EFAULT; - ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } @@ -1052,12 +1048,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts; - struct timespec64 end_time, *to = NULL; + struct timespec64 ts, 
end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1103,10 +1098,10 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) static -int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, +int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { - struct timespec ts; + struct timespec64 ts; if (!p) return ret; @@ -1118,8 +1113,8 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); + ktime_get_ts64(&ts); + ts = timespec64_sub(*end_time, ts); if (ts.tv_sec < 0) ts.tv_sec = ts.tv_nsec = 0; @@ -1132,12 +1127,7 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) + if (!compat_put_timespec64(&ts, p)) return ret; } /* @@ -1195,7 +1185,7 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) + struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -1268,7 +1258,7 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct compat_timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct compat_timeval tv; int ret; @@ -1312,14 +1302,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1330,9 +1318,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); @@ -1381,14 +1368,12 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct compat_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1399,9 +1384,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return 
-EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); diff --git a/fs/signalfd.c b/fs/signalfd.c index 1c667af86da5..5f1ff8756595 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -313,15 +313,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, compat_size_t, sigsetsize, int, flags) { - compat_sigset_t ss32; sigset_t tmp; sigset_t __user *ksigmask; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&tmp, sigmask)) return -EFAULT; - sigset_from_compat(&tmp, &ss32); ksigmask = compat_alloc_user_space(sizeof(sigset_t)); if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index cf01e15a7b16..8a73b97217c8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -195,7 +195,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; err = -ENOMEM; @@ -373,7 +373,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/statfs.c b/fs/statfs.c index c25dd9a26cc1..5b2a24f0f263 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -35,11 +35,11 @@ static int flags_by_mnt(int mnt_flags) static int flags_by_sb(int s_flags) { int flags = 0; - if (s_flags & MS_SYNCHRONOUS) + if (s_flags & SB_SYNCHRONOUS) flags |= ST_SYNCHRONOUS; - if (s_flags & MS_MANDLOCK) + if (s_flags & SB_MANDLOCK) flags |= ST_MANDLOCK; - if (s_flags & MS_RDONLY) + if (s_flags & SB_RDONLY) flags |= ST_RDONLY; return flags; } @@ -217,7 +217,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user return error; } -int vfs_ustat(dev_t dev, struct kstatfs *sbuf) +static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { struct super_block *s = user_get_super(dev); int err; diff --git a/fs/super.c b/fs/super.c index 994db21f59bf..7ff1349609e4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -155,21 +155,19 @@ static void destroy_super_rcu(struct rcu_head *head) schedule_work(&s->destroy_work); } -/** - * destroy_super - frees a superblock - * @s: superblock to free - * - * Frees a superblock. - */ -static void destroy_super(struct super_block *s) +/* Free a superblock that has never been seen by anyone */ +static void destroy_unused_super(struct super_block *s) { + if (!s) + return; + up_write(&s->s_umount); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); - WARN_ON(!list_empty(&s->s_mounts)); put_user_ns(s->s_user_ns); kfree(s->s_subtype); - call_rcu(&s->rcu, destroy_super_rcu); + /* no delays needed */ + destroy_super_work(&s->destroy_work); } /** @@ -193,6 +191,24 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. 
Since these are clearly distinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; @@ -220,25 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru)) goto fail; - - init_rwsem(&s->s_umount); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * sget() can have s_umount recursion. - * - * When it cannot find a suitable sb, it allocates a new - * one (this one), and tries again to find a suitable old - * one. - * - * In case that succeeds, it will acquire the s_umount - * lock of the old one. Since these are clearly distinct - * locks, and this object isn't exposed yet, there's no - * risk of deadlocks. - * - * Annotate this by putting this lock in a different - * subclass. - */ - down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); @@ -257,7 +254,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, return s; fail: - destroy_super(s); + destroy_unused_super(s); return NULL; } @@ -266,11 +263,17 @@ fail: /* * Drop a superblock's refcount. The caller must hold sb_lock. */ -static void __put_super(struct super_block *sb) +static void __put_super(struct super_block *s) { - if (!--sb->s_count) { - list_del_init(&sb->s_list); - destroy_super(sb); + if (!--s->s_count) { + list_del_init(&s->s_list); + WARN_ON(s->s_dentry_lru.node); + WARN_ON(s->s_inode_lru.node); + WARN_ON(!list_empty(&s->s_mounts)); + security_sb_free(s); + put_user_ns(s->s_user_ns); + kfree(s->s_subtype); + call_rcu(&s->rcu, destroy_super_rcu); } } @@ -485,19 +488,12 @@ retry: continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); - if (s) { - up_write(&s->s_umount); - destroy_super(s); - } + destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; - if (s) { - up_write(&s->s_umount); - destroy_super(s); - s = NULL; - } + destroy_unused_super(s); return old; } } @@ -512,8 +508,7 @@ retry: err = set(s, data); if (err) { spin_unlock(&sb_lock); - up_write(&s->s_umount); - destroy_super(s); + destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 20b8f82e115b..fb49510c5dcf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -30,7 +30,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, void *ns; bool new_sb; - if (!(flags & MS_KERNMOUNT)) { + if (!(flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return ERR_PTR(-EPERM); } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3c47b7d5d4cf..bec9f79adb25 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -63,7 +63,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); if (sbi->s_forced_ro) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 0d56e486b392..89765ddfb738 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -333,7 +333,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size) /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; if (sbi->s_forced_ro) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (sbi->s_truncate) sb->s_d_op = &sysv_dentry_operations; 
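The fs/super.c hunks above replace destroy_super() with a NULL-tolerant destroy_unused_super(), so every sget() error path tears down a never-published candidate superblock with one unconditional call instead of an if (s) { up_write(...); destroy_super(s); } block, while superblocks that did reach the global list are now freed through __put_super(), which carries the WARN_ON sanity checks. A minimal user-space sketch of that pattern follows; the struct fields and allocation are simplified stand-ins, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct super_block: only what the sketch needs. */
struct super_block {
	int   s_count;
	char *s_subtype;
};

/*
 * Free a candidate superblock that nobody else has ever seen.
 * Tolerating NULL is the point: error paths call this whether or
 * not a candidate was ever allocated.
 */
static void destroy_unused_super(struct super_block *s)
{
	if (!s)
		return;
	free(s->s_subtype);	/* free(NULL) is a no-op, like kfree() */
	free(s);
}

/* Caricature of the sget() retry loop after the change. */
static struct super_block *sget_sketch(int found_existing)
{
	struct super_block *s = NULL;

	if (found_existing) {
		/* Bail out before allocating: s is still NULL, call is safe. */
		destroy_unused_super(s);
		return NULL;
	}

	s = calloc(1, sizeof(*s));
	if (!s)
		return NULL;
	s->s_count = 1;

	/* Pretend setup failed: same unconditional teardown. */
	destroy_unused_super(s);
	return NULL;
}

int main(void)
{
	sget_sketch(1);
	sget_sketch(0);
	puts("every error path used the same NULL-tolerant teardown");
	return 0;
}

Letting the helper accept NULL is what removes the per-caller bookkeeping: the retry loop no longer has to track whether a candidate has been allocated before it bails out.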
root_inode = sysv_iget(sb, SYSV_ROOT_INO); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a02aa59d1e24..dfe85069586e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1406,7 +1406,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & MS_LAZYTIME)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME)) iflags |= I_DIRTY_SYNC; release = ui->dirty; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3be28900bf37..fe77e9625e84 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -84,7 +84,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) if (!c->ro_error) { c->ro_error = 1; c->no_chk_data_crc = 0; - c->vfs_sb->s_flags |= MS_RDONLY; + c->vfs_sb->s_flags |= SB_RDONLY; ubifs_warn(c, "switched to read-only mode, error %d", err); dump_stack(); } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7503e7cdf870..0beb285b143d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -968,7 +968,7 @@ static int parse_standard_option(const char *option) pr_notice("UBIFS: parse %s\n", option); if (!strcmp(option, "sync")) - return MS_SYNCHRONOUS; + return SB_SYNCHRONOUS; return 0; } @@ -1160,8 +1160,8 @@ static int mount_ubifs(struct ubifs_info *c) size_t sz; c->ro_mount = !!sb_rdonly(c->vfs_sb); - /* Suppress error messages while probing if MS_SILENT is set */ - c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + /* Suppress error messages while probing if SB_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & SB_SILENT); err = init_constants_early(c); if (err) @@ -1852,7 +1852,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } - if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_mount && !(*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; @@ -1864,7 +1864,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_remount_rw(c); if (err) return err; - } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + } else if (!c->ro_mount && (*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; @@ -2117,7 +2117,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); @@ -2143,18 +2143,18 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if (!!(flags & MS_RDONLY) != c1->ro_mount) { + if (!!(flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 
1 : 0); if (err) goto out_deact; /* We do not support atime */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; #ifndef CONFIG_UBIFS_ATIME_SUPPORT - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #else ubifs_msg(c, "full atime support is enabled."); #endif diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 63c7468147eb..5ee7af879cc4 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1201,7 +1201,7 @@ struct ubifs_debug_info; * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay * @mounting: %1 while mounting - * @probing: %1 while attempting to mount if MS_SILENT mount flag is set + * @probing: %1 while attempting to mount if SB_SILENT mount flag is set * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay @@ -1850,7 +1850,7 @@ __printf(2, 3) void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); /* * A conditional variant of 'ubifs_err()' which doesn't output anything - * if probing (ie. MS_SILENT set). + * if probing (ie. SB_SILENT set). */ #define ubifs_errc(c, fmt, ...) \ do { \ diff --git a/fs/udf/super.c b/fs/udf/super.c index f80e0a0f24d3..f73239a9a97d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -650,7 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) + if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY)) return -EACCES; } @@ -673,10 +673,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_dmode = uopt.dmode; write_unlock(&sbi->s_cred_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out_unlock; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b5cd79065ef9..e727ee07dbe4 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -115,7 +115,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -205,7 +205,7 @@ do_more: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); if (overflow) { @@ -567,7 +567,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -688,7 +688,7 @@ cg_found: succed: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 916b4a428933..e1ef0f0a1353 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -112,7 +112,7 @@ void ufs_free_inode (struct inode * inode) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & 
SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -146,14 +146,14 @@ static void ufs2_init_inodes_chunk(struct super_block *sb, set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); UFSD("EXIT\n"); @@ -284,7 +284,7 @@ cg_found: } ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -330,7 +330,7 @@ cg_found: ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6440003f8ddc..4d497e9c6883 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -282,7 +282,7 @@ void ufs_error (struct super_block * sb, const char * function, usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } va_start(args, fmt); vaf.fmt = fmt; @@ -320,7 +320,7 @@ void ufs_panic (struct super_block * sb, const char * function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; pr_crit("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); @@ -905,7 +905,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=old is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -921,7 +921,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -937,7 +937,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep-cd is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -953,7 +953,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=openstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -968,7 +968,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=hp is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; default: @@ -1125,21 +1125,21 @@ magic_found: break; case UFS_FSACTIVE: pr_err("%s(): fs is active\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; case UFS_FSBAD: pr_err("%s(): fs is bad\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; default: pr_err("%s(): can't grok fs_clean 0x%x\n", __func__, usb1->fs_clean); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; } } else { pr_err("%s(): fs needs fsck\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -1328,7 +1328,7 @@ static int ufs_remount (struct super_block *sb, int 
*mount_flags, char *data) return -EINVAL; } - if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) { + if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; @@ -1337,7 +1337,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) /* * fs was mouted as rw, remounting ro */ - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = cpu_to_fs32(sb, get_seconds()); if ((flags & UFS_ST_MASK) == UFS_ST_SUN @@ -1346,7 +1346,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ubh_mark_buffer_dirty (USPI_UBH(uspi)); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else { /* * fs was mounted as ro, remounting rw @@ -1370,7 +1370,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) mutex_unlock(&UFS_SB(sb)->s_lock); return -EPERM; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f46d133c0949..ac9a4e65ca49 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -668,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->features = octx->features; ctx->released = false; ctx->mm = vma->vm_mm; - atomic_inc(&ctx->mm->mm_count); + mmgrab(ctx->mm); userfaultfd_ctx_get(octx); fctx->orig = octx; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 758f37ac5ad3..4b87472f35bc 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name) } static inline kmem_zone_t * -kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, +kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags, void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 0da80019a917..83ed7715f856 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -702,7 +702,7 @@ xfs_alloc_ag_vextent( ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ - if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, args->agbno, args->len, &args->oinfo); if (error) @@ -1682,7 +1682,7 @@ xfs_free_ag_extent( bno_cur = cnt_cur = NULL; mp = tp->t_mountp; - if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(oinfo)) { error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6249c92671de..a76914db72ef 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -212,6 +212,7 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_defer_ops dfops; struct xfs_trans_res tres; @@ -327,9 +328,16 @@ xfs_attr_set( * GROT: another possible req'mt for a double-split btree op. 
*/ xfs_defer_init(args.dfops, args.firstblock); - error = xfs_attr_shortform_to_leaf(&args); + error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); if (error) goto out_defer_cancel; + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args.trans, leaf_bp); + xfs_defer_bjoin(args.dfops, leaf_bp); xfs_defer_ijoin(args.dfops, dp); error = xfs_defer_finish(&args.trans, args.dfops); if (error) @@ -337,13 +345,14 @@ xfs_attr_set( /* * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf. + * transaction to add the new attribute to the leaf, which + * means that we have to hold & join the leaf buffer here too. */ - error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; - + xfs_trans_bjoin(args.trans, leaf_bp); + leaf_bp = NULL; } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -374,8 +383,9 @@ xfs_attr_set( out_defer_cancel: xfs_defer_cancel(&dfops); - args.trans = NULL; out: + if (leaf_bp) + xfs_trans_brelse(args.trans, leaf_bp); if (args.trans) xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 53cc8b986eac..601eaa36f1ad 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -735,10 +735,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) } /* - * Convert from using the shortform to the leaf. + * Convert from using the shortform to the leaf. On success, return the + * buffer so that we can keep it locked until we're totally done with it. */ int -xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +xfs_attr_shortform_to_leaf( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) { xfs_inode_t *dp; xfs_attr_shortform_t *sf; @@ -818,7 +821,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } error = 0; - + *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f7dda0c237b0..894124efb421 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -48,7 +48,8 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, + struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 08df809e2315..1bddbba6b80c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5136,7 +5136,7 @@ __xfs_bunmapi( * blowing out the transaction with a mix of EFIs and reflink * adjustments. 
*/ - if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); else max_len = len; @@ -5662,7 +5662,8 @@ xfs_bmap_collapse_extents( *done = true; goto del_cursor; } - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); new_startoff = got.br_startoff - offset_shift_fsb; if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { @@ -5767,7 +5768,8 @@ xfs_bmap_insert_extents( goto del_cursor; } } - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); if (stop_fsb >= got.br_startoff + got.br_blockcount) { error = -EIO; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 072ebfe1d6ae..087fea02c389 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -249,6 +249,10 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + /* Hold the (previously bjoin'd) buffer locked across the roll. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) + xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ @@ -264,6 +268,12 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) { + xfs_trans_bjoin(*tp, dop->dop_bufs[i]); + xfs_trans_bhold(*tp, dop->dop_bufs[i]); + } + return error; } @@ -295,6 +305,31 @@ xfs_defer_ijoin( } } + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Add this buffer to the deferred op. Each joined buffer is relogged + * each time we roll the transaction. + */ +int +xfs_defer_bjoin( + struct xfs_defer_ops *dop, + struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) { + if (dop->dop_bufs[i] == bp) + return 0; + else if (dop->dop_bufs[i] == NULL) { + dop->dop_bufs[i] = bp; + return 0; + } + } + + ASSERT(0); return -EFSCORRUPTED; } @@ -493,9 +528,7 @@ xfs_defer_init( struct xfs_defer_ops *dop, xfs_fsblock_t *fbp) { - dop->dop_committed = false; - dop->dop_low = false; - memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + memset(dop, 0, sizeof(struct xfs_defer_ops)); *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index d4f046dd44bd..045beacdd37d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,7 @@ enum xfs_defer_ops_type { }; #define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ struct xfs_defer_ops { bool dop_committed; /* did any trans commit? 
*/ @@ -66,8 +67,9 @@ struct xfs_defer_ops { struct list_head dop_intake; /* unlogged pending work */ struct list_head dop_pending; /* logged pending work */ - /* relog these inodes with each roll */ + /* relog these with each roll */ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; + struct xfs_buf *dop_bufs[XFS_DEFER_OPS_NR_BUFS]; }; void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, @@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp); /* Description of a deferred type. */ struct xfs_defer_op_type { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de3f04a98656..3b57ef0f2f76 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -920,8 +920,7 @@ STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - umode_t mode, /* bits set to indicate file type */ - int okalloc) /* ok to allocate more space */ + umode_t mode) /* bits set to indicate file type */ { xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ @@ -978,9 +977,6 @@ xfs_ialloc_ag_select( return agno; } - if (!okalloc) - goto nextag; - if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, agno, flags); if (error) @@ -1680,7 +1676,6 @@ xfs_dialloc( struct xfs_trans *tp, xfs_ino_t parent, umode_t mode, - int okalloc, struct xfs_buf **IO_agbp, xfs_ino_t *inop) { @@ -1692,6 +1687,7 @@ xfs_dialloc( int noroom = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; + int okalloc = 1; if (*IO_agbp) { /* @@ -1707,7 +1703,7 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + start_agno = xfs_ialloc_ag_select(tp, parent, mode); if (start_agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index d2bdcd5e7312..66a8de0b1caa 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -81,7 +81,6 @@ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. 
inode header */ xfs_ino_t *inop); /* inode number allocated */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 343a94246f5b..b0f31791c7e6 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -302,7 +302,7 @@ xfs_iext_rec_cmp( xfs_fileoff_t offset) { uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK; - u32 rec_len = rec->hi & XFS_IEXT_LENGTH_MASK; + uint32_t rec_len = rec->hi & XFS_IEXT_LENGTH_MASK; if (rec_offset > offset) return 1; @@ -632,8 +632,6 @@ xfs_iext_insert( struct xfs_iext_leaf *new = NULL; int nr_entries, i; - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (ifp->if_height == 0) xfs_iext_alloc_root(ifp, cur); else if (ifp->if_height == 1) @@ -661,6 +659,8 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); + if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } @@ -850,9 +850,9 @@ static void xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { - ifp->if_u1.if_root = NULL; ifp->if_height--; kmem_free(ifp->if_u1.if_root); + ifp->if_u1.if_root = NULL; } void diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1c90ec41e9df..c79a1616b79d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -42,11 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); -static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) -{ - return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); -} - /* * Copy inode type and data and attr format specific information from the * on-disk inode to the in-core inode and fork structures. 
For fifos, devices, @@ -792,7 +787,8 @@ xfs_iflush_fork( case XFS_DINODE_FMT_DEV: if (iip->ili_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); - xfs_dinode_put_rdev(dip, sysv_encode_dev(VFS_I(ip)->i_rdev)); + xfs_dinode_put_rdev(dip, + linux_to_xfs_dev_t(VFS_I(ip)->i_rdev)); } break; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 996f035ee205..349d9f8edb89 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -274,7 +274,7 @@ struct xfs_inode_log_format { uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - u8 __pad[16]; /* unused */ + uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ @@ -295,7 +295,7 @@ struct xfs_inode_log_format_32 { uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - u8 __pad[16]; /* unused */ + uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 585b35d34142..c40d26763075 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1488,27 +1488,12 @@ __xfs_refcount_cow_alloc( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Add refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); - if (error) - return error; - - /* Add rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* @@ -1521,27 +1506,12 @@ __xfs_refcount_cow_free( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Remove refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_FREE, dfops); - if (error) - return error; - - /* Remove rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_free_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* Record a CoW staging extent in the refcount btree. */ @@ -1552,11 +1522,19 @@ xfs_refcount_alloc_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + error = __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, fsb, len); + if (error) + return error; + + /* Add rmap entry */ + return xfs_rmap_alloc_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. 
*/ @@ -1567,9 +1545,17 @@ xfs_refcount_free_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; + /* Remove rmap entry */ + error = xfs_rmap_free_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + if (error) + return error; + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, fsb, len); } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index dd019cee1b3b..50db920ceeeb 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -368,6 +368,51 @@ xfs_rmap_lookup_le_range( } /* + * Perform all the relevant owner checks for a removal op. If we're doing an + * unknown-owner removal then we have no owner information to check. + */ +static int +xfs_rmap_free_check_owner( + struct xfs_mount *mp, + uint64_t ltoff, + struct xfs_rmap_irec *rec, + xfs_fsblock_t bno, + xfs_filblks_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int error = 0; + + if (owner == XFS_RMAP_OWN_UNKNOWN) + return 0; + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (rec->rm_flags & XFS_RMAP_UNWRITTEN), out); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out); + + /* Check the offset, if necessary. */ + if (XFS_RMAP_NON_INODE_OWNER(owner)) + goto out; + + if (flags & XFS_RMAP_BMBT_BLOCK) { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK, + out); + } else { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out); + XFS_WANT_CORRUPTED_GOTO(mp, + ltoff + rec->rm_blockcount >= offset + len, + out); + } + +out: + return error; +} + +/* * Find the extent in the rmap btree and remove it. * * The record we find should always be an exact match for the extent that we're @@ -444,33 +489,40 @@ xfs_rmap_unmap( goto out_done; } - /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + /* + * If we're doing an unknown-owner removal for EFI recovery, we expect + * to find the full range in the rmapbt or nothing at all. If we + * don't find any rmaps overlapping either end of the range, we're + * done. Hopefully this means that the EFI creator already queued + * (and finished) a RUI to remove the rmap. + */ + if (owner == XFS_RMAP_OWN_UNKNOWN && + ltrec.rm_startblock + ltrec.rm_blockcount <= bno) { + struct xfs_rmap_irec rtrec; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + if (i == 0) + goto out_done; + error = xfs_rmap_get_rec(cur, &rtrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (rtrec.rm_startblock >= bno + len) + goto out_done; + } /* Make sure the extent we found covers the entire freeing range. */ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); - /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || - XFS_RMAP_NON_INODE_OWNER(owner), out_error); - - /* Check the offset, if necessary. 
*/ - if (!XFS_RMAP_NON_INODE_OWNER(owner)) { - if (flags & XFS_RMAP_BMBT_BLOCK) { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK, - out_error); - } else { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_offset <= offset, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, - ltoff + ltrec.rm_blockcount >= offset + len, - out_error); - } - } + /* Check owner information. */ + error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, bno, len, owner, + offset, flags); + if (error) + goto out_error; if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ @@ -664,6 +716,7 @@ xfs_rmap_map( flags |= XFS_RMAP_UNWRITTEN; trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, unwritten, oinfo); + ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* * For the initial lookup, look for an exact match or the left-adjacent diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 466ede637080..0fcd5b1ba729 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -61,7 +61,21 @@ static inline void xfs_rmap_skip_owner_update( struct xfs_owner_info *oi) { - oi->oi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL); +} + +static inline bool +xfs_rmap_should_skip_owner_update( + struct xfs_owner_info *oi) +{ + return oi->oi_owner == XFS_RMAP_OWN_NULL; +} + +static inline void +xfs_rmap_any_owner_update( + struct xfs_owner_info *oi) +{ + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN); } /* Reverse mapping functions. */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 637b7a892313..f120fb20452f 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -318,8 +318,20 @@ xfs_scrub_dinode( /* di_mode */ mode = be16_to_cpu(dip->di_mode); - if (mode & ~(S_IALLUGO | S_IFMT)) + switch (mode & S_IFMT) { + case S_IFLNK: + case S_IFREG: + case S_IFDIR: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + /* mode is recognized */ + break; + default: xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } /* v1/v2 fields */ switch (dip->di_version) { diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 8e58ba842946..3d9037eceaf1 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -107,7 +107,7 @@ xfs_scrub_quota_item( unsigned long long rcount; xfs_ino_t fs_icount; - offset = id * qi->qi_dqperchunk; + offset = id / qi->qi_dqperchunk; /* * We fed $id and DQNEXT into the xfs_qm_dqget call, which means @@ -207,7 +207,7 @@ xfs_scrub_quota( xfs_dqid_t id = 0; uint dqtype; int nimaps; - int error; + int error = 0; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return -ENOENT; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9c42c4efd01e..ab3aef2ae823 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -46,7 +46,6 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" -#include "scrub/scrub.h" #include "scrub/btree.h" /* diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 472080e75788..86daed0e3a45 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -26,7 +26,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a3eeaba156c5..21e2d70884e1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -399,7 +399,7 @@ xfs_map_blocks( (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); - if 
(offset + count > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -896,13 +896,13 @@ xfs_writepage_map( struct writeback_control *wbc, struct inode *inode, struct page *page, - loff_t offset, - uint64_t end_offset) + uint64_t end_offset) { LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; struct buffer_head *bh, *head; ssize_t len = i_blocksize(inode); + uint64_t offset; int error = 0; int count = 0; int uptodate = 1; @@ -1146,7 +1146,7 @@ xfs_do_writepage( end_offset = offset; } - return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset); + return xfs_writepage_map(wpc, wbc, inode, page, end_offset); redirty: redirty_page_for_writepage(wbc, page); @@ -1265,7 +1265,7 @@ xfs_map_trim_size( if (mapping_size > size) mapping_size = size; if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { + (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, i_blocksize(inode)); @@ -1312,7 +1312,7 @@ xfs_get_blocks( lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + size > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes) size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index dd136f7275e4..e5fb008d75e8 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -389,7 +389,8 @@ xfs_bud_init( int xfs_bui_recover( struct xfs_mount *mp, - struct xfs_bui_log_item *buip) + struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops) { int error = 0; unsigned int bui_type; @@ -404,9 +405,7 @@ xfs_bui_recover( xfs_exntst_t state; struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_defer_ops dfops; struct xfs_bmbt_irec irec; - xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -464,7 +463,6 @@ xfs_bui_recover( if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - xfs_defer_init(&dfops, &firstfsb); /* Process deferred bmap item. */ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? @@ -479,16 +477,16 @@ xfs_bui_recover( break; default: error = -EFSCORRUPTED; - goto err_dfops; + goto err_inode; } xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); if (error) - goto err_dfops; + goto err_inode; if (count > 0) { ASSERT(type == XFS_BMAP_UNMAP); @@ -496,16 +494,11 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec); if (error) - goto err_dfops; + goto err_inode; } - /* Finish transaction, free inodes. 
*/ - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto err_dfops; - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -513,8 +506,6 @@ xfs_bui_recover( return error; -err_dfops: - xfs_defer_cancel(&dfops); err_inode: xfs_trans_cancel(tp); if (ip) { diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index c867daae4a3c..24b354a2c836 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, struct xfs_bui_log_item *); void xfs_bui_item_free(struct xfs_bui_log_item *); void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops); #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4db6e8d780f6..4c6e86d861fd 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1815,22 +1815,27 @@ xfs_alloc_buftarg( btp->bt_daxdev = dax_dev; if (xfs_setsize_buftarg_early(btp, bdev)) - goto error; + goto error_free; if (list_lru_init(&btp->bt_lru)) - goto error; + goto error_free; if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error; + goto error_lru; btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&btp->bt_shrinker); + if (register_shrinker(&btp->bt_shrinker)) + goto error_pcpu; return btp; -error: +error_pcpu: + percpu_counter_destroy(&btp->bt_io_count); +error_lru: + list_lru_destroy(&btp->bt_lru); +error_free: kmem_free(btp); return NULL; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d57c2db64e59..f248708c10ff 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -970,14 +970,22 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if ((lip->li_flags & XFS_LI_IN_AIL) && - lip->li_lsn == qip->qli_flush_lsn) { + ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_flags & XFS_LI_FAILED))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - if (lip->li_lsn == qip->qli_flush_lsn) + if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); - else + } else { + /* + * Clear the failed state since we are about to drop the + * flush lock + */ + if (lip->li_flags & XFS_LI_FAILED) + xfs_clear_li_failed(lip); spin_unlock(&ailp->xa_lock); + } } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 2c7a1629e064..664dea105e76 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -137,6 +137,26 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * this informs the AIL that the dquot is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. 
+ */ +STATIC void +xfs_dquot_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + struct xfs_dquot *dqp; + + dqp = DQUOT_ITEM(lip)->qli_dquot; + ASSERT(!completion_done(&dqp->q_flush)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -144,13 +164,28 @@ __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO. + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -242,7 +277,8 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_committing = xfs_qm_dquot_logitem_committing + .iop_committing = xfs_qm_dquot_logitem_committing, + .iop_error = xfs_dquot_item_error }; /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 44f8c5451210..64da90655e95 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -538,7 +538,7 @@ xfs_efi_recover( return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xfs_rmap_skip_owner_update(&oinfo); + xfs_rmap_any_owner_update(&oinfo); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 18146873a8b3..8601275cc5e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -44,6 +44,7 @@ #include <linux/falloc.h> #include <linux/pagevec.h> #include <linux/backing-dev.h> +#include <linux/mman.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -1045,7 +1046,11 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); + pfn_t pfn; + + ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite( } /* - * pfn_mkwrite was originally inteneded to ensure we capture time stamp - * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED - * to ensure we serialise the fault barrier in place. + * pfn_mkwrite was originally intended to ensure we capture time stamp updates + * on write faults. In reality, it needs to serialise against truncate and + * prepare memory for writing so handle it as a standard write fault.
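[Ed. note: the __xfs_filemap_fault() hunk above wires up synchronous DAX faults: dax_iomap_fault() now also returns the pfn, and when it reports VM_FAULT_NEEDDSYNC the filesystem calls dax_finish_sync_fault(), which, as I read the helper, syncs the dirty metadata covering the page before the PTE is made writable. A toy model of that two-phase handshake follows; the flag values and stub functions are illustrative, not the kernel's.]

#include <stdio.h>
#include <stdbool.h>

#define VM_FAULT_NOPAGE    0x01	/* illustrative values */
#define VM_FAULT_NEEDDSYNC 0x80

/* Phase 1: install a read-only mapping; report whether metadata backing
 * this page is still volatile and must be made durable before any write. */
static unsigned fault_install_mapping(bool metadata_dirty)
{
	return metadata_dirty ? VM_FAULT_NEEDDSYNC : VM_FAULT_NOPAGE;
}

/* Phase 2: flush the metadata, then let stores hit the page directly. */
static unsigned finish_sync_fault(void)
{
	puts("metadata synced, upgrading mapping to writable");
	return VM_FAULT_NOPAGE;
}

int main(void)
{
	unsigned ret = fault_install_mapping(true);

	if (ret & VM_FAULT_NEEDDSYNC)
		ret = finish_sync_fault();
	printf("fault result: %#x\n", ret);
	return 0;
}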
*/ static int xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret = VM_FAULT_NOPAGE; - loff_t size; - - trace_xfs_filemap_pfn_mkwrite(ip); - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* check if the faulting page hasn't raced with truncate */ - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); - return ret; - + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } static const struct vm_operations_struct xfs_file_vm_ops = { @@ -1136,6 +1120,13 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes up with a sensible use case. + */ + if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) @@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8f22fc579dbb..60a2e128cb6a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -571,6 +571,11 @@ xfs_growfs_data_private( * this doesn't actually exist in the rmap btree. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, agno, + be32_to_cpu(agf->agf_length) - new, + new, &oinfo); + if (error) + goto error0; error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 43005fbe8b1e..3861d61fb265 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -870,7 +870,7 @@ xfs_eofblocks_worker( * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). * (We'll just piggyback on the post-EOF prealloc space workqueue.) */ -STATIC void +void xfs_queue_cowblocks( struct xfs_mount *mp) { @@ -1536,8 +1536,23 @@ xfs_inode_free_quota_eofblocks( return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); } +static inline unsigned long +xfs_iflag_for_tag( + int tag) +{ + switch (tag) { + case XFS_ICI_EOFBLOCKS_TAG: + return XFS_IEOFBLOCKS; + case XFS_ICI_COWBLOCKS_TAG: + return XFS_ICOWBLOCKS; + default: + ASSERT(0); + return 0; + } +} + static void -__xfs_inode_set_eofblocks_tag( +__xfs_inode_set_blocks_tag( xfs_inode_t *ip, void (*execute)(struct xfs_mount *mp), void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1552,10 +1567,10 @@ __xfs_inode_set_eofblocks_tag( * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set.
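[Ed. note: with .mmap_supported_flags = MAP_SYNC and the VM_SYNC check above, XFS only accepts synchronous mappings on DAX inodes. For context, this is roughly what the feature looks like from userspace; /mnt/pmem/data is a hypothetical file on a DAX-mounted XFS, and full durability of the stores still requires flushing CPU caches (e.g. via libpmem), which this sketch glosses over.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SYNC
#define MAP_SYNC 0x80000	/* from <linux/mman.h> on older toolchains */
#endif
#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif

int main(void)
{
	int fd = open("/mnt/pmem/data", O_RDWR);	/* hypothetical DAX file */
	char *p;

	if (fd < 0)
		return 1;
	/*
	 * MAP_SHARED_VALIDATE makes the kernel reject flags it does not
	 * understand, so an unsupported MAP_SYNC fails loudly; the
	 * xfs_file_mmap() hunk above is where XFS returns -EOPNOTSUPP
	 * for non-DAX inodes.
	 */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Faults on this mapping sync file metadata before going writable,
	 * so a store plus a CPU cache flush is durable without fsync(). */
	strcpy(p, "hello, persistent world");
	munmap(p, 4096);
	close(fd);
	return 0;
}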
*/ - if (ip->i_flags & XFS_IEOFBLOCKS) + if (ip->i_flags & xfs_iflag_for_tag(tag)) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= XFS_IEOFBLOCKS; + ip->i_flags |= xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1587,13 +1602,13 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, trace_xfs_perag_set_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } static void -__xfs_inode_clear_eofblocks_tag( +__xfs_inode_clear_blocks_tag( xfs_inode_t *ip, void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, int error, unsigned long caller_ip), @@ -1603,7 +1618,7 @@ __xfs_inode_clear_eofblocks_tag( struct xfs_perag *pag; spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~XFS_IEOFBLOCKS; + ip->i_flags &= ~xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1630,7 +1645,7 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } @@ -1724,7 +1739,7 @@ xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, trace_xfs_perag_set_cowblocks, XFS_ICI_COWBLOCKS_TAG); } @@ -1734,6 +1749,6 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index bff4d85e5498..d4a77588eca1 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -81,6 +81,7 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); +void xfs_queue_cowblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d8226f7a5dde..6f95bdb408ce 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -749,7 +749,6 @@ xfs_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, - int okalloc, xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { @@ -765,7 +764,7 @@ xfs_ialloc( * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, ialloc_context, &ino); if (error) return error; @@ -957,7 +956,6 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be locked. */ int *committed) @@ -988,8 +986,8 @@ xfs_dir_ialloc( * transaction commit so that no other process can steal * the inode(s) that we've just allocated. 
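[Ed. note: xfs_iflag_for_tag() above is what lets one pair of helpers serve both the EOF-blocks and CoW-blocks radix-tree tags. A standalone sketch of the mapping pattern; ICOWBLOCKS (1 << 12) matches the xfs_inode.h hunk further below, the other values are assumed for illustration.]

#include <assert.h>
#include <stdio.h>

enum { ICI_EOFBLOCKS_TAG, ICI_COWBLOCKS_TAG };	/* illustrative tag ids */
#define IEOFBLOCKS (1u << 9)	/* assumed value, demo only */
#define ICOWBLOCKS (1u << 12)	/* matches the xfs_inode.h hunk */

/* One tag maps to one in-core inode flag; asserting on unknown tags keeps
 * the helper honest if a third tag is ever added. */
static unsigned long iflag_for_tag(int tag)
{
	switch (tag) {
	case ICI_EOFBLOCKS_TAG:
		return IEOFBLOCKS;
	case ICI_COWBLOCKS_TAG:
		return ICOWBLOCKS;
	default:
		assert(0);
		return 0;
	}
}

int main(void)
{
	unsigned long i_flags = 0;

	i_flags |= iflag_for_tag(ICI_COWBLOCKS_TAG);	/* set path */
	printf("after set: %#lx\n", i_flags);
	i_flags &= ~iflag_for_tag(ICI_COWBLOCKS_TAG);	/* clear path */
	printf("after clear: %#lx\n", i_flags);
	return 0;
}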
*/ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &ip); + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, + &ip); /* * Return an error if we were unable to allocate a new inode. @@ -1061,7 +1059,7 @@ * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &ip); + &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -1182,11 +1180,6 @@ xfs_create( xfs_flush_inodes(mp); error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1203,19 +1196,13 @@ if (error) goto out_trans_cancel; - if (!resblks) { - error = xfs_dir_canenter(tp, dp, name); - if (error) - goto out_trans_cancel; - } - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also has the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, + NULL); if (error) goto out_trans_cancel; @@ -1340,11 +1327,6 @@ xfs_create_tmpfile( tres = &M_RES(mp)->tr_create_tmpfile; error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1353,8 +1335,7 @@ if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); if (error) goto out_trans_cancel; @@ -1506,6 +1487,24 @@ xfs_link( return error; } +/* Clear the reflink flag and the cowblocks tag if possible. */ +static void +xfs_itruncate_clear_reflink_flags( + struct xfs_inode *ip) +{ + struct xfs_ifork *dfork; + struct xfs_ifork *cfork; + + if (!xfs_is_reflink_inode(ip)) + return; + dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + if (dfork->if_bytes == 0 && cfork->if_bytes == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (cfork->if_bytes == 0) + xfs_inode_clear_cowblocks_tag(ip); +} + /* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and @@ -1602,15 +1601,7 @@ xfs_itruncate_extents( if (error) goto out; - /* - * Clear the reflink flag if there are no data fork blocks and - * there are no extents staged in the cow fork. - */ - if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { - if (ip->i_d.di_nblocks == 0) - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; - xfs_inode_clear_cowblocks_tag(ip); - } + xfs_itruncate_clear_reflink_flags(ip); /* * Always re-log the inode so that our permanent transaction can keep @@ -2357,6 +2348,7 @@ retry: */ if (ip->i_ino != inum + i) { xfs_iunlock(ip, XFS_ILOCK_EXCL); + rcu_read_unlock(); continue; } } @@ -2400,6 +2392,24 @@ retry: } /* + * Free any local-format buffers sitting around before we reset to + * extents format.
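[Ed. note: xfs_itruncate_clear_reflink_flags() above replaces the old i_cnextents/di_nblocks test with per-fork emptiness checks: the reflink flag may only drop when both the data and CoW forks are empty, while the cowblocks tag needs just an empty CoW fork. A toy model of those rules; the struct and field names are illustrative.]

#include <stdbool.h>
#include <stdio.h>

/* Just enough inode state to show the clearing rules. */
struct toy_inode {
	bool reflink;		/* stands in for XFS_DIFLAG2_REFLINK */
	bool cowblocks_tag;	/* stands in for the COWBLOCKS radix tag */
	long data_fork_bytes;
	long cow_fork_bytes;
};

static void clear_reflink_flags(struct toy_inode *ip)
{
	if (!ip->reflink)
		return;
	/* Flag goes away only when BOTH forks are empty... */
	if (ip->data_fork_bytes == 0 && ip->cow_fork_bytes == 0)
		ip->reflink = false;
	/* ...but the tag needs only an empty CoW fork. */
	if (ip->cow_fork_bytes == 0)
		ip->cowblocks_tag = false;
}

int main(void)
{
	struct toy_inode ip = { true, true, 4096, 0 };

	clear_reflink_flags(&ip);
	/* Data fork still has blocks: flag stays set, tag is dropped. */
	printf("reflink=%d cowblocks_tag=%d\n", ip.reflink, ip.cowblocks_tag);
	return 0;
}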
+ */ +static inline void +xfs_ifree_local_data( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); +} + +/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2436,6 +2446,9 @@ xfs_ifree( if (error) return error; + xfs_ifree_local_data(ip, XFS_DATA_FORK); + xfs_ifree_local_data(ip, XFS_ATTR_FORK); + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index cc13c3763721..d383e392ec9d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -232,6 +232,7 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * log recovery to replay a bmap operation on the inode. */ #define XFS_IRECOVERY (1 << 11) +#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -428,7 +429,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, - xfs_nlink_t, dev_t, prid_t, int, + xfs_nlink_t, dev_t, prid_t, struct xfs_inode **, int *); /* from xfs_file.c */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18077e2189a9..7ab52a8bc0a9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -1089,6 +1090,10 @@ xfs_file_iomap_begin( trace_xfs_iomap_found(ip, offset, length, 0, &imap); } + if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields + & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + xfs_bmbt_to_iomap(ip, iomap, &imap); if (shared) @@ -1208,7 +1213,7 @@ xfs_xattr_iomap_begin( ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); + &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: xfs_iunlock(ip, lockmode); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 6282bfc1afa9..99562ec0de56 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -204,6 +204,16 @@ static inline kgid_t xfs_gid_to_kgid(uint32_t gid) return make_kgid(&init_user_ns, gid); } +static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) +{ + return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); +} + +static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) +{ + return sysv_encode_dev(dev); +} + /* * Various platform dependent calls that don't fit anywhere else */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 38d4227895ae..a503af96d780 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -781,17 +781,17 @@ xfs_log_mount_finish( * something to an unlinked inode, the irele won't cause * premature truncation and freeing of the inode, which results * in log recovery failure. We have to evict the unreferenced - * lru inodes after clearing MS_ACTIVE because we don't + * lru inodes after clearing SB_ACTIVE because we don't * otherwise clean up the lru if there's a subsequent failure in * xfs_mountfs, which leads to us leaking the inodes if nothing * else (e.g. 
quotacheck) references the inodes before the * mount failure occurs. */ - mp->m_super->s_flags |= MS_ACTIVE; + mp->m_super->s_flags |= SB_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); - mp->m_super->s_flags &= ~MS_ACTIVE; + mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 87b1c331f9eb..28d1abfe835e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -4716,7 +4717,8 @@ STATIC int xlog_recover_process_cui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_cui_log_item *cuip; int error; @@ -4729,7 +4731,7 @@ xlog_recover_process_cui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_cui_recover(mp, cuip); + error = xfs_cui_recover(mp, cuip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4756,7 +4758,8 @@ STATIC int xlog_recover_process_bui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_bui_log_item *buip; int error; @@ -4769,7 +4772,7 @@ xlog_recover_process_bui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_bui_recover(mp, buip); + error = xfs_bui_recover(mp, buip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4805,6 +4808,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) } } +/* Take all the collected deferred ops and finish them in order. */ +static int +xlog_finish_defer_ops( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops) +{ + struct xfs_trans *tp; + int64_t freeblks; + uint resblks; + int error; + + /* + * We're finishing the defer_ops that accumulated as a result of + * recovering unfinished intent items during log recovery. We + * reserve an itruncate transaction because it is the largest + * permanent transaction type. Since we're the only user of the fs + * right now, take 93% (15/16) of the available free blocks. Use + * weird math to avoid a 64-bit division. + */ + freeblks = percpu_counter_sum(&mp->m_fdblocks); + if (freeblks <= 0) + return -ENOSPC; + resblks = min_t(int64_t, UINT_MAX, freeblks); + resblks = (resblks * 15) >> 4; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + error = xfs_defer_finish(&tp, dfops); + if (error) + goto out_cancel; + + return xfs_trans_commit(tp); + +out_cancel: + xfs_trans_cancel(tp); + return error; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4825,10 +4868,12 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_log_item *lip; - int error = 0; + struct xfs_defer_ops dfops; struct xfs_ail_cursor cur; + struct xfs_log_item *lip; struct xfs_ail *ailp; + xfs_fsblock_t firstfsb; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif @@ -4839,6 +4884,7 @@ xlog_recover_process_intents( #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif + xfs_defer_init(&dfops, &firstfsb); while (lip != NULL) { /* * We're done when we see something other than an intent. 
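[Ed. note: the reservation sizing in xlog_finish_defer_ops() above is worth unpacking: the "weird math" keeps everything in 32 bits, since a straight 64-bit division would need do_div() on 32-bit kernels, and (x * 15) >> 4 is 15/16 rounded down, i.e. the comment's roughly 93%. A standalone sketch with a made-up free-block count:]

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

int main(void)
{
	/* Hypothetical free-space figure; the kernel sums the m_fdblocks
	 * per-cpu counter instead. */
	int64_t freeblks = 10000000;
	uint32_t resblks;

	/*
	 * Clamp to 32 bits first, then take 15/16 with a multiply and a
	 * shift; no 64-bit division anywhere. The widening cast keeps the
	 * intermediate product from wrapping at UINT_MAX.
	 */
	resblks = freeblks > UINT_MAX ? UINT_MAX : (uint32_t)freeblks;
	resblks = ((uint64_t)resblks * 15) >> 4;
	printf("reserving %u of %lld free blocks\n",
	       resblks, (long long)freeblks);
	return 0;
}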
@@ -4859,6 +4905,12 @@ xlog_recover_process_intents( */ ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + /* + * NOTE: If your intent processing routine can create more + * deferred ops, you /must/ attach them to the dfops in this + * routine or else those subsequent intents will get + * replayed in the wrong order! + */ switch (lip->li_type) { case XFS_LI_EFI: error = xlog_recover_process_efi(log->l_mp, ailp, lip); @@ -4867,10 +4919,12 @@ xlog_recover_process_intents( error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; case XFS_LI_CUI: - error = xlog_recover_process_cui(log->l_mp, ailp, lip); + error = xlog_recover_process_cui(log->l_mp, ailp, lip, + &dfops); break; case XFS_LI_BUI: - error = xlog_recover_process_bui(log->l_mp, ailp, lip); + error = xlog_recover_process_bui(log->l_mp, ailp, lip, + &dfops); break; } if (error) @@ -4880,6 +4934,11 @@ xlog_recover_process_intents( out: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); + if (error) + xfs_defer_cancel(&dfops); + else + error = xlog_finish_defer_ops(log->l_mp, &dfops); + return error; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 010a13a201aa..ec952dfad359 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -793,8 +793,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, + &committed); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8f2e2fac4255..3a55d6fc271b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -393,7 +393,8 @@ xfs_cud_init( int xfs_cui_recover( struct xfs_mount *mp, - struct xfs_cui_log_item *cuip) + struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops) { int i; int error = 0; @@ -405,11 +406,9 @@ xfs_cui_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; enum xfs_refcount_intent_type type; - xfs_fsblock_t firstfsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; struct xfs_bmbt_irec irec; - struct xfs_defer_ops dfops; bool requeue_only = false; ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); @@ -465,7 +464,6 @@ xfs_cui_recover( return error; cudp = xfs_trans_get_cud(tp, cuip); - xfs_defer_init(&dfops, &firstfsb); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { refc = &cuip->cui_format.cui_extents[i]; refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; @@ -485,7 +483,7 @@ xfs_cui_recover( new_len = refc->pe_len; } else error = xfs_trans_log_finish_refcount_update(tp, cudp, - &dfops, type, refc->pe_startblock, refc->pe_len, + dfops, type, refc->pe_startblock, refc->pe_len, &new_fsb, &new_len, &rcur); if (error) goto abort_error; @@ -497,21 +495,21 @@ xfs_cui_recover( switch (type) { case XFS_REFCOUNT_INCREASE: error = xfs_refcount_increase_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_decrease_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_ALLOC_COW: error = xfs_refcount_alloc_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: error = xfs_refcount_free_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; @@ -525,17 +523,12 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops); - if 
(error) - goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error; abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); -abort_defer: - xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index 5b74dddfa64b..0e5327349a13 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, struct xfs_cui_log_item *); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops); #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cc041a29eb70..47aea2e82c26 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -49,8 +49,6 @@ #include "xfs_alloc.h" #include "xfs_quota_defs.h" #include "xfs_quota.h" -#include "xfs_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_iomap.h" #include "xfs_rmap_btree.h" @@ -456,6 +454,8 @@ retry: if (error) goto out_bmap_cancel; + xfs_inode_set_cowblocks_tag(ip); + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops); if (error) @@ -492,8 +492,9 @@ xfs_reflink_find_cow_mapping( struct xfs_iext_cursor icur; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!xfs_is_reflink_inode(ip)) + return false; offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return false; @@ -612,6 +613,9 @@ xfs_reflink_cancel_cow_blocks( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); + } else { + /* Didn't do anything, push cursor back. */ + xfs_iext_prev(ifp, &icur); } next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) @@ -727,7 +731,7 @@ xfs_reflink_end_cow( (unsigned int)(end_fsb - offset_fsb), XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, 0, &tp); + resblks, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; @@ -1293,6 +1297,17 @@ xfs_reflink_remap_range( trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + /* + * Clear out post-eof preallocations because we don't have page cache + * backing the delayed allocations and they'll never get freed on + * their own. + */ + if (xfs_can_free_eofblocks(dest, true)) { + ret = xfs_free_eofblocks(dest); + if (ret) + goto out_unlock; + } + /* Set flags and remap blocks. 
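[Ed. note: the xfs_reflink_end_cow() hunk above passes XFS_TRANS_RESERVE so the CoW remap transaction may dip into the reserve block pool: by the time I/O completion runs, the data is already written, and failing the remap with ENOSPC would effectively corrupt the file. A toy model of that two-tier allocation policy; the numbers and names are invented.]

#include <stdio.h>
#include <stdbool.h>

static long free_blocks = 5;	/* nearly full filesystem */
static long reserve_blocks = 20;

static bool trans_alloc(long need, bool can_use_reserve)
{
	if (free_blocks >= need) {
		free_blocks -= need;
		return true;
	}
	/* Only reserve-privileged transactions may tap the emergency pool. */
	if (can_use_reserve && free_blocks + reserve_blocks >= need) {
		reserve_blocks -= need - free_blocks;
		free_blocks = 0;
		return true;
	}
	return false;	/* -ENOSPC */
}

int main(void)
{
	/* An ordinary write transaction may fail... */
	printf("plain alloc: %s\n", trans_alloc(10, false) ? "ok" : "ENOSPC");
	/* ...but an I/O-completion transaction must not. */
	printf("reserve alloc: %s\n", trans_alloc(10, true) ? "ok" : "ENOSPC");
	return 0;
}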
*/ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f663022353c0..1dacccc367f8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -212,9 +212,9 @@ xfs_parseargs( */ if (sb_rdonly(sb)) mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & MS_DIRSYNC) + if (sb->s_flags & SB_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) mp->m_flags |= XFS_MOUNT_WSYNC; /* @@ -1312,7 +1312,7 @@ xfs_fs_remount( } /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) { if (mp->m_flags & XFS_MOUNT_NORECOVERY) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); @@ -1360,6 +1360,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } + xfs_queue_cowblocks(mp); /* Create the per-AG metadata reservation pool. */ error = xfs_fs_reserve_ag_blocks(mp); @@ -1368,7 +1369,15 @@ } /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { + /* Get rid of any leftover CoW reservations... */ + cancel_delayed_work_sync(&mp->m_cowblocks_work); + error = xfs_icache_free_cowblocks(mp, NULL); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 5f2f32408011..fcc5dfc70aa0 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -30,7 +30,7 @@ extern void xfs_qm_exit(void); #ifdef CONFIG_XFS_POSIX_ACL # define XFS_ACL_STRING "ACLs, " -# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +# define set_posix_acl_flag(sb) ((sb)->s_flags |= SB_POSIXACL) #else # define XFS_ACL_STRING # define set_posix_acl_flag(sb) do { } while (0) diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 68d3ca2c4968..2e9e793a8f9d 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -232,11 +232,6 @@ xfs_symlink( resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); - if (error == -ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, - &tp); - } if (error) goto out_release_inode; @@ -260,14 +255,6 @@ goto out_trans_cancel; /* - * Check for ability to enter directory entry, if no space reserved. - */ - if (!resblks) { - error = xfs_dir_canenter(tp, dp, link_name); - if (error) - goto out_trans_cancel; - } - /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ @@ -277,7 +264,7 @@ * Allocate an inode for the symlink.
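[Ed. note: the xfs_fs_remount() hunks above make the CoW background machinery follow the mount state: going read-only cancels and drains the cowblocks worker and frees leftover CoW reservations before the per-AG pool is torn down, and going read-write requeues the worker. A stub sketch of the ordering; the function names paraphrase the xfs_* calls rather than reproduce them.]

#include <stdio.h>

static void queue_cow_worker(void)   { puts("  queue cowblocks worker"); }
static void reserve_ag_pool(void)    { puts("  reserve per-AG metadata pool"); }
static void cancel_cow_worker(void)  { puts("  cancel_delayed_work_sync(cowblocks)"); }
static void free_cow_blocks(void)    { puts("  free leftover CoW reservations"); }
static void unreserve_ag_pool(void)  { puts("  unreserve per-AG metadata pool"); }

int main(void)
{
	puts("remount ro -> rw:");
	queue_cow_worker();	/* background CoW work only runs read-write */
	reserve_ag_pool();

	puts("remount rw -> ro:");
	cancel_cow_worker();	/* no new CoW work may race with teardown */
	free_cow_blocks();
	unreserve_ag_pool();
	return 0;
}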
*/ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); + prid, &ip, NULL); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 5d95fe348294..35f3546b6af5 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -24,7 +24,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 515ba042d75c..d718a10c2271 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); - TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault),