From 2796d303e3c5ec213c578ed3a66872205c126eb8 Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Wed, 25 Apr 2018 11:30:04 -0700
Subject: cifs: Allocate validate negotiation request through kmalloc

The data buffer allocated on the stack can't be DMA'ed, ib_dma_map_page will
return an invalid DMA address for a buffer on stack. Even worse, this
incorrect address can't be detected by ib_dma_mapping_error. Sending data
from this address to hardware will not fail, but the remote peer will get
junk data.

Fix this by allocating the request on the heap in smb3_validate_negotiate.

Changes in v2:
Removed duplicated code on freeing buffers on function exit.
(Thanks to Parav Pandit <parav@mellanox.com>)
Fixed typo in the patch title.

Changes in v3:
Added "Fixes" to the patch.
Changed several sizeof() to use *pointer in place of struct.

Changes in v4:
Added detailed comments on the failure through RDMA.
Allocate request buffer using GPF_NOFS.
Fixed possible memory leak.

Changes in v5:
Removed variable ret for checking return value.
Changed to use pneg_inbuf->Dialects[0] to calculate unused space in pneg_inbuf.

Fixes: ff1c038addc4 ("Check SMB3 dialects against downgrade attacks")
Signed-off-by: Long Li <longli@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Reviewed-by: Tom Talpey <ttalpey@microsoft.com>
---
 fs/cifs/smb2pdu.c | 68 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 60db51bae0e3..260e9c4219d8 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -730,8 +730,8 @@ neg_exit:
 
 int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
 {
-	int rc = 0;
-	struct validate_negotiate_info_req vneg_inbuf;
+	int rc;
+	struct validate_negotiate_info_req *pneg_inbuf;
 	struct validate_negotiate_info_rsp *pneg_rsp = NULL;
 	u32 rsplen;
 	u32 inbuflen; /* max of 4 dialects */
@@ -765,63 +765,69 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
 	if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL)
 		cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n");
 
-	vneg_inbuf.Capabilities =
+	pneg_inbuf = kmalloc(sizeof(*pneg_inbuf), GFP_NOFS);
+	if (!pneg_inbuf)
+		return -ENOMEM;
+
+	pneg_inbuf->Capabilities =
 			cpu_to_le32(tcon->ses->server->vals->req_capabilities);
-	memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid,
+	memcpy(pneg_inbuf->Guid, tcon->ses->server->client_guid,
 					SMB2_CLIENT_GUID_SIZE);
 
 	if (tcon->ses->sign)
-		vneg_inbuf.SecurityMode =
+		pneg_inbuf->SecurityMode =
 			cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
 	else if (global_secflags & CIFSSEC_MAY_SIGN)
-		vneg_inbuf.SecurityMode =
+		pneg_inbuf->SecurityMode =
 			cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
 	else
-		vneg_inbuf.SecurityMode = 0;
+		pneg_inbuf->SecurityMode = 0;
 
 
 	if (strcmp(tcon->ses->server->vals->version_string,
 		SMB3ANY_VERSION_STRING) == 0) {
-		vneg_inbuf.Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
-		vneg_inbuf.Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
-		vneg_inbuf.DialectCount = cpu_to_le16(2);
+		pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
+		pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
+		pneg_inbuf->DialectCount = cpu_to_le16(2);
 		/* structure is big enough for 3 dialects, sending only 2 */
-		inbuflen = sizeof(struct validate_negotiate_info_req) - 2;
+		inbuflen = sizeof(*pneg_inbuf) -
+				sizeof(pneg_inbuf->Dialects[0]);
 	} else if (strcmp(tcon->ses->server->vals->version_string,
 		SMBDEFAULT_VERSION_STRING) == 0) {
-		vneg_inbuf.Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
-		vneg_inbuf.Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
-		vneg_inbuf.Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
-		vneg_inbuf.DialectCount = cpu_to_le16(3);
+		pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
+		pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
+		pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
+		pneg_inbuf->DialectCount = cpu_to_le16(3);
 		/* structure is big enough for 3 dialects */
-		inbuflen = sizeof(struct validate_negotiate_info_req);
+		inbuflen = sizeof(*pneg_inbuf);
 	} else {
 		/* otherwise specific dialect was requested */
-		vneg_inbuf.Dialects[0] =
+		pneg_inbuf->Dialects[0] =
 			cpu_to_le16(tcon->ses->server->vals->protocol_id);
-		vneg_inbuf.DialectCount = cpu_to_le16(1);
+		pneg_inbuf->DialectCount = cpu_to_le16(1);
 		/* structure is big enough for 3 dialects, sending only 1 */
-		inbuflen = sizeof(struct validate_negotiate_info_req) - 4;
+		inbuflen = sizeof(*pneg_inbuf) -
+				sizeof(pneg_inbuf->Dialects[0]) * 2;
 	}
 
 	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
 		FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
-		(char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
-		(char **)&pneg_rsp, &rsplen);
+		(char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
 
 	if (rc != 0) {
 		cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
-		return -EIO;
+		rc = -EIO;
+		goto out_free_inbuf;
 	}
 
-	if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
+	rc = -EIO;
+	if (rsplen != sizeof(*pneg_rsp)) {
 		cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n",
 			 rsplen);
 
 		/* relax check since Mac returns max bufsize allowed on ioctl */
-		if ((rsplen > CIFSMaxBufSize)
-		     || (rsplen < sizeof(struct validate_negotiate_info_rsp)))
-			goto err_rsp_free;
+		if (rsplen > CIFSMaxBufSize || rsplen < sizeof(*pneg_rsp))
+			goto out_free_rsp;
 	}
 
 	/* check validate negotiate info response matches what we got earlier */
@@ -838,15 +844,17 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
 		goto vneg_out;
 
 	/* validate negotiate successful */
+	rc = 0;
 	cifs_dbg(FYI, "validate negotiate info successful\n");
-	kfree(pneg_rsp);
-	return 0;
+	goto out_free_rsp;
 
 vneg_out:
 	cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
-err_rsp_free:
+out_free_rsp:
 	kfree(pneg_rsp);
-	return -EIO;
+out_free_inbuf:
+	kfree(pneg_inbuf);
+	return rc;
 }
 
 enum securityEnum
-- 
cgit v1.2.1


From f7c439668a291ca94f358e44d3a3e9f2a2524b8a Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Wed, 25 Apr 2018 11:30:05 -0700
Subject: cifs: smbd: Enable signing with smbdirect

Now signing is supported with RDMA transport.

Remove the code that disabled it.

Signed-off-by: Long Li <longli@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
---
 fs/cifs/connect.c | 8 --------
 fs/cifs/smb2pdu.c | 5 -----
 2 files changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a5aa158d535a..7a10a5d0731f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1977,14 +1977,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		goto cifs_parse_mount_err;
 	}
 
-#ifdef CONFIG_CIFS_SMB_DIRECT
-	if (vol->rdma && vol->sign) {
-		cifs_dbg(VFS, "Currently SMB direct doesn't support signing."
-			" This is being fixed\n");
-		goto cifs_parse_mount_err;
-	}
-#endif
-
 #ifndef CONFIG_KEYS
 	/* Muliuser mounts require CONFIG_KEYS support */
 	if (vol->multiuser) {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 260e9c4219d8..0f48741a0130 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -738,11 +738,6 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
 
 	cifs_dbg(FYI, "validate negotiate\n");
 
-#ifdef CONFIG_CIFS_SMB_DIRECT
-	if (tcon->ses->server->rdma)
-		return 0;
-#endif
-
 	/* In SMB3.11 preauth integrity supersedes validate negotiate */
 	if (tcon->ses->server->dialect == SMB311_PROT_ID)
 		return 0;
-- 
cgit v1.2.1


From ae2cd7fb478b8da707906ee1706ae1379968a8f9 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <palcantara@suse.de>
Date: Fri, 4 May 2018 11:25:26 -0300
Subject: cifs: smb2ops: Fix listxattr() when there are no EAs

As per listxattr(2):

       On success, a nonnegative number is returned indicating the size
       of the extended attribute name list.  On failure, -1 is returned
       and errno  is set appropriately.

In SMB1, when the server returns an empty EA list through a listxattr(),
it will correctly return 0 as there are no EAs for the given file.

However, in SMB2+, it returns -ENODATA in listxattr() which is wrong since
the request and response were sent successfully, although there's no actual
EA for the given file.

This patch fixes listxattr() for SMB2+ by returning 0 in cifs_listxattr()
when the server returns an empty list of EAs.

Signed-off-by: Paulo Alcantara <palcantara@suse.de>
Reviewed-by: Aurelien Aptel <aaptel@suse.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2ops.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index b76b85881dcc..9c6d95ffca97 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -589,9 +589,15 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
 
 	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 
+	/*
+	 * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's
+	 * not an error. Otherwise, the specified ea_name was not found.
+	 */
 	if (!rc)
 		rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data,
 					  SMB2_MAX_EA_BUF, ea_name);
+	else if (!ea_name && rc == -ENODATA)
+		rc = 0;
 
 	kfree(smb2_data);
 	return rc;
-- 
cgit v1.2.1


From 6e70c267e68d77679534dcf4aaf84e66f2cf1425 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Thu, 10 May 2018 10:59:37 -0500
Subject: smb3: directory sync should not return an error

As with NFS, which ignores sync on directory handles,
fsync on a directory handle is a noop for CIFS/SMB3.
Do not return an error on it.  It breaks some database
apps otherwise.

Signed-off-by: Steve French <smfrench@gmail.com>
CC: Stable <stable@vger.kernel.org>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
---
 fs/cifs/cifsfs.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f715609b13f3..5a5a0158cc8f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1047,6 +1047,18 @@ out:
 	return rc;
 }
 
+/*
+ * Directory operations under CIFS/SMB2/SMB3 are synchronous, so fsync()
+ * is a dummy operation.
+ */
+static int cifs_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	cifs_dbg(FYI, "Sync directory - name: %pD datasync: 0x%x\n",
+		 file, datasync);
+
+	return 0;
+}
+
 static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
 				struct file *dst_file, loff_t destoff,
 				size_t len, unsigned int flags)
@@ -1181,6 +1193,7 @@ const struct file_operations cifs_dir_ops = {
 	.copy_file_range = cifs_copy_file_range,
 	.clone_file_range = cifs_clone_file_range,
 	.llseek = generic_file_llseek,
+	.fsync = cifs_dir_fsync,
 };
 
 static void
-- 
cgit v1.2.1


From 3955333df9a50e8783d115613a397ae55d905080 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Fri, 11 May 2018 16:01:57 -0700
Subject: proc/kcore: don't bounds check against address 0

The existing kcore code checks for bad addresses against __va(0) with
the assumption that this is the lowest address on the system.  This may
not hold true on some systems (e.g.  arm64) and produce overflows and
crashes.  Switch to using other functions to validate the address range.

It's currently only seen on arm64 and it's not clear if anyone wants to
use that particular combination on a stable release.  So this is not
urgent for stable.

Link: http://lkml.kernel.org/r/20180501201143.15121-1-labbott@redhat.com
Signed-off-by: Laura Abbott <labbott@redhat.com>
Tested-by: Dave Anderson <anderson@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>a
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d1e82761de81..e64ecb9f2720 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -209,25 +209,34 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
 {
 	struct list_head *head = (struct list_head *)arg;
 	struct kcore_list *ent;
+	struct page *p;
+
+	if (!pfn_valid(pfn))
+		return 1;
+
+	p = pfn_to_page(pfn);
+	if (!memmap_valid_within(pfn, p, page_zone(p)))
+		return 1;
 
 	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
 	if (!ent)
 		return -ENOMEM;
-	ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
+	ent->addr = (unsigned long)page_to_virt(p);
 	ent->size = nr_pages << PAGE_SHIFT;
 
-	/* Sanity check: Can happen in 32bit arch...maybe */
-	if (ent->addr < (unsigned long) __va(0))
+	if (!virt_addr_valid(ent->addr))
 		goto free_out;
 
 	/* cut not-mapped area. ....from ppc-32 code. */
 	if (ULONG_MAX - ent->addr < ent->size)
 		ent->size = ULONG_MAX - ent->addr;
 
-	/* cut when vmalloc() area is higher than direct-map area */
-	if (VMALLOC_START > (unsigned long)__va(0)) {
-		if (ent->addr > VMALLOC_START)
-			goto free_out;
+	/*
+	 * We've already checked virt_addr_valid so we know this address
+	 * is a valid pointer, therefore we can check against it to determine
+	 * if we need to trim
+	 */
+	if (VMALLOC_START > ent->addr) {
 		if (VMALLOC_START - ent->addr < ent->size)
 			ent->size = VMALLOC_START - ent->addr;
 	}
-- 
cgit v1.2.1


From e4383029201470523c3ffe339bd7d57e9b4a7d65 Mon Sep 17 00:00:00 2001
From: Ashish Samant <ashish.samant@oracle.com>
Date: Fri, 11 May 2018 16:02:07 -0700
Subject: ocfs2: take inode cluster lock before moving reflinked inode from
 orphan dir

While reflinking an inode, we create a new inode in orphan directory,
then take EX lock on it, reflink the original inode to orphan inode and
release EX lock.  Once the lock is released another node could request
it in EX mode from ocfs2_recover_orphans() which causes downconvert of
the lock, on this node, to NL mode.

Later we attempt to initialize security acl for the orphan inode and
move it to the reflink destination.  However, while doing this we dont
take EX lock on the inode.  This could potentially cause problems
because we could be starting transaction, accessing journal and
modifying metadata of the inode while holding NL lock and with another
node holding EX lock on the inode.

Fix this by taking orphan inode cluster lock in EX mode before
initializing security and moving orphan inode to reflink destination.
Use the __tracker variant while taking inode lock to avoid recursive
locking in the ocfs2_init_security_and_acl() call chain.

Link: http://lkml.kernel.org/r/1523475107-7639-1-git-send-email-ashish.samant@oracle.com
Signed-off-by: Ashish Samant <ashish.samant@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Acked-by: Jun Piao <piaojun@huawei.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/refcounttree.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 01c6b3894406..7869622af22a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4250,10 +4250,11 @@ out:
 static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 			 struct dentry *new_dentry, bool preserve)
 {
-	int error;
+	int error, had_lock;
 	struct inode *inode = d_inode(old_dentry);
 	struct buffer_head *old_bh = NULL;
 	struct inode *new_orphan_inode = NULL;
+	struct ocfs2_lock_holder oh;
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
 		return -EOPNOTSUPP;
@@ -4295,6 +4296,14 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 		goto out;
 	}
 
+	had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1,
+					    &oh);
+	if (had_lock < 0) {
+		error = had_lock;
+		mlog_errno(error);
+		goto out;
+	}
+
 	/* If the security isn't preserved, we need to re-initialize them. */
 	if (!preserve) {
 		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
@@ -4302,14 +4311,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 		if (error)
 			mlog_errno(error);
 	}
-out:
 	if (!error) {
 		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
 						       new_dentry);
 		if (error)
 			mlog_errno(error);
 	}
+	ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock);
 
+out:
 	if (new_orphan_inode) {
 		/*
 		 * We need to open_unlock the inode no matter whether we
-- 
cgit v1.2.1


From b61f7dcf4eb2653e870c9079b02d11a0834cfe39 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 Apr 2018 20:46:22 +0100
Subject: afs: Fix directory page locking

The afs directory loading code (primarily afs_read_dir()) locks all the
pages that hold a directory's content blob to defend against
getdents/getdents races and getdents/lookup races where the competitors
issue conflicting reads on the same data.  As the reads will complete
consecutively, they may retrieve different versions of the data and
one may overwrite the data that the other is busy parsing.

Fix this by not locking the pages at all, but rather by turning the
validation lock into an rwsem and getting an exclusive lock on it whilst
reading the data or validating the attributes and a shared lock whilst
parsing the data.  Sharing the attribute validation lock should be fine as
the data fetch will retrieve the attributes also.

The individual page locks aren't needed at all as the only place they're
being used is to serialise data loading.

Without this patch, the:

 	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
		...
	}

part of afs_read_dir() may be skipped, leaving the pages unlocked when we
hit the success: clause - in which case we try to unlock the not-locked
pages, leading to the following oops:

  page:ffffe38b405b4300 count:3 mapcount:0 mapping:ffff98156c83a978 index:0x0
  flags: 0xfffe000001004(referenced|private)
  raw: 000fffe000001004 ffff98156c83a978 0000000000000000 00000003ffffffff
  raw: dead000000000100 dead000000000200 0000000000000001 ffff98156b27c000
  page dumped because: VM_BUG_ON_PAGE(!PageLocked(page))
  page->mem_cgroup:ffff98156b27c000
  ------------[ cut here ]------------
  kernel BUG at mm/filemap.c:1205!
  ...
  RIP: 0010:unlock_page+0x43/0x50
  ...
  Call Trace:
   afs_dir_iterate+0x789/0x8f0 [kafs]
   ? _cond_resched+0x15/0x30
   ? kmem_cache_alloc_trace+0x166/0x1d0
   ? afs_do_lookup+0x69/0x490 [kafs]
   ? afs_do_lookup+0x101/0x490 [kafs]
   ? key_default_cmp+0x20/0x20
   ? request_key+0x3c/0x80
   ? afs_lookup+0xf1/0x340 [kafs]
   ? __lookup_slow+0x97/0x150
   ? lookup_slow+0x35/0x50
   ? walk_component+0x1bf/0x490
   ? path_lookupat.isra.52+0x75/0x200
   ? filename_lookup.part.66+0xa0/0x170
   ? afs_end_vnode_operation+0x41/0x60 [kafs]
   ? __check_object_size+0x9c/0x171
   ? strncpy_from_user+0x4a/0x170
   ? vfs_statx+0x73/0xe0
   ? __do_sys_newlstat+0x39/0x70
   ? __x64_sys_getdents+0xc9/0x140
   ? __x64_sys_getdents+0x140/0x140
   ? do_syscall_64+0x5b/0x160
   ? entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: f3ddee8dc4e2 ("afs: Fix directory handling")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c      | 36 +++++++++++++++++-------------------
 fs/afs/inode.c    |  6 +++---
 fs/afs/internal.h |  2 +-
 fs/afs/super.c    |  2 +-
 4 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5889f70d4d27..2853acd64482 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -180,6 +180,7 @@ static int afs_dir_open(struct inode *inode, struct file *file)
  * get reclaimed during the iteration.
  */
 static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
+	__acquires(&dvnode->validate_lock)
 {
 	struct afs_read *req;
 	loff_t i_size;
@@ -261,18 +262,21 @@ retry:
 	/* If we're going to reload, we need to lock all the pages to prevent
 	 * races.
 	 */
-	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
-		ret = -ERESTARTSYS;
-		for (i = 0; i < req->nr_pages; i++)
-			if (lock_page_killable(req->pages[i]) < 0)
-				goto error_unlock;
+	ret = -ERESTARTSYS;
+	if (down_read_killable(&dvnode->validate_lock) < 0)
+		goto error;
 
-		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-			goto success;
+	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+		goto success;
+
+	up_read(&dvnode->validate_lock);
+	if (down_write_killable(&dvnode->validate_lock) < 0)
+		goto error;
 
+	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
 		ret = afs_fetch_data(dvnode, key, req);
 		if (ret < 0)
-			goto error_unlock_all;
+			goto error_unlock;
 
 		task_io_account_read(PAGE_SIZE * req->nr_pages);
 
@@ -284,33 +288,26 @@ retry:
 		for (i = 0; i < req->nr_pages; i++)
 			if (!afs_dir_check_page(dvnode, req->pages[i],
 						req->actual_len))
-				goto error_unlock_all;
+				goto error_unlock;
 
 		// TODO: Trim excess pages
 
 		set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
 	}
 
+	downgrade_write(&dvnode->validate_lock);
 success:
-	i = req->nr_pages;
-	while (i > 0)
-		unlock_page(req->pages[--i]);
 	return req;
 
-error_unlock_all:
-	i = req->nr_pages;
 error_unlock:
-	while (i > 0)
-		unlock_page(req->pages[--i]);
+	up_write(&dvnode->validate_lock);
 error:
 	afs_put_read(req);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 
 content_has_grown:
-	i = req->nr_pages;
-	while (i > 0)
-		unlock_page(req->pages[--i]);
+	up_write(&dvnode->validate_lock);
 	afs_put_read(req);
 	goto retry;
 }
@@ -473,6 +470,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
 	}
 
 out:
+	up_read(&dvnode->validate_lock);
 	afs_put_read(req);
 	_leave(" = %d", ret);
 	return ret;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 06194cfe9724..e855c6e5cf28 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -415,7 +415,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	if (valid)
 		goto valid;
 
-	mutex_lock(&vnode->validate_lock);
+	down_write(&vnode->validate_lock);
 
 	/* if the promise has expired, we need to check the server again to get
 	 * a new promise - note that if the (parent) directory's metadata was
@@ -444,13 +444,13 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	 * different */
 	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
 		afs_zap_data(vnode);
-	mutex_unlock(&vnode->validate_lock);
+	up_write(&vnode->validate_lock);
 valid:
 	_leave(" = 0");
 	return 0;
 
 error_unlock:
-	mutex_unlock(&vnode->validate_lock);
+	up_write(&vnode->validate_lock);
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index f8086ec95e24..468be1e0dffb 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -494,7 +494,7 @@ struct afs_vnode {
 #endif
 	struct afs_permits __rcu *permit_cache;	/* cache of permits so far obtained */
 	struct mutex		io_lock;	/* Lock for serialising I/O on this mutex */
-	struct mutex		validate_lock;	/* lock for validating this vnode */
+	struct rw_semaphore	validate_lock;	/* lock for validating this vnode */
 	spinlock_t		wb_lock;	/* lock for wb_keys */
 	spinlock_t		lock;		/* waitqueue/flags lock */
 	unsigned long		flags;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 65081ec3c36e..b02838cd9525 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -590,7 +590,7 @@ static void afs_i_init_once(void *_vnode)
 	memset(vnode, 0, sizeof(*vnode));
 	inode_init_once(&vnode->vfs_inode);
 	mutex_init(&vnode->io_lock);
-	mutex_init(&vnode->validate_lock);
+	init_rwsem(&vnode->validate_lock);
 	spin_lock_init(&vnode->wb_lock);
 	spin_lock_init(&vnode->lock);
 	INIT_LIST_HEAD(&vnode->wb_keys);
-- 
cgit v1.2.1


From 01fd79e6de74a447c5657913a335d9ce6508cdb1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 May 2018 22:03:18 +0100
Subject: afs: Fix address list parsing

The parsing of port specifiers in the address list obtained from the DNS
resolution upcall doesn't work as in4_pton() and in6_pton() will fail on
encountering an unexpected delimiter (in this case, the '+' marking the
port number).  However, in*_pton() can't be given multiple specifiers.

Fix this by finding the delimiter in advance and not relying on in*_pton()
to find the end of the address for us.

Fixes: 8b2a464ced77 ("afs: Add an address list concept")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/addr_list.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 3bedfed608a2..7587fb665ff1 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -121,7 +121,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
 	p = text;
 	do {
 		struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
-		char tdelim = delim;
+		const char *q, *stop;
 
 		if (*p == delim) {
 			p++;
@@ -130,28 +130,33 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
 
 		if (*p == '[') {
 			p++;
-			tdelim = ']';
+			q = memchr(p, ']', end - p);
+		} else {
+			for (q = p; q < end; q++)
+				if (*q == '+' || *q == delim)
+					break;
 		}
 
-		if (in4_pton(p, end - p,
+		if (in4_pton(p, q - p,
 			     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
-			     tdelim, &p)) {
+			     -1, &stop)) {
 			srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
 			srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
 			srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
-		} else if (in6_pton(p, end - p,
+		} else if (in6_pton(p, q - p,
 				    srx->transport.sin6.sin6_addr.s6_addr,
-				    tdelim, &p)) {
+				    -1, &stop)) {
 			/* Nothing to do */
 		} else {
 			goto bad_address;
 		}
 
-		if (tdelim == ']') {
-			if (p == end || *p != ']')
-				goto bad_address;
+		if (stop != q)
+			goto bad_address;
+
+		p = q;
+		if (q < end && *q == ']')
 			p++;
-		}
 
 		if (p < end) {
 			if (*p == '+') {
-- 
cgit v1.2.1


From f2686b09269ec1a6f23028b5675d87c3b4579a4c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 14:12:50 +0100
Subject: afs: Fix giving up callbacks on server destruction

When a server record is destroyed, we want to send a message to the server
telling it that we're giving up all the callbacks it has promised us.

Apply two fixes to this:

 (1) Only send the FS.GiveUpAllCallBacks message if we actually got a
     callback from that server.  We assume this to be the case if we
     performed at least one successful FS operation on that server.

 (2) Send it to the address last used for that server rather than always
     picking the first address in the list (which might be unreachable).

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/internal.h | 1 +
 fs/afs/rxrpc.c    | 6 +++++-
 fs/afs/server.c   | 8 +++++---
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 468be1e0dffb..8de04b29bec1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -396,6 +396,7 @@ struct afs_server {
 #define AFS_SERVER_FL_PROBED	5		/* The fileserver has been probed */
 #define AFS_SERVER_FL_PROBING	6		/* Fileserver is being probed */
 #define AFS_SERVER_FL_NO_IBULK	7		/* Fileserver doesn't support FS.InlineBulkStatus */
+#define AFS_SERVER_FL_MAY_HAVE_CB 8		/* May have callbacks on this fileserver */
 	atomic_t		usage;
 	u32			addr_version;	/* Address list version */
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 5c6263972ec9..1f6235a6e9ae 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -482,8 +482,12 @@ static void afs_deliver_to_call(struct afs_call *call)
 		state = READ_ONCE(call->state);
 		switch (ret) {
 		case 0:
-			if (state == AFS_CALL_CL_PROC_REPLY)
+			if (state == AFS_CALL_CL_PROC_REPLY) {
+				if (call->cbi)
+					set_bit(AFS_SERVER_FL_MAY_HAVE_CB,
+						&call->cbi->server->flags);
 				goto call_complete;
+			}
 			ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY);
 			goto done;
 		case -EINPROGRESS:
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 629c74986cff..2c5cff60e34d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -395,14 +395,16 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
 	struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
 	struct afs_addr_cursor ac = {
 		.alist	= alist,
-		.addr	= &alist->addrs[0],
 		.start	= alist->index,
-		.index	= alist->index,
+		.index	= 0,
+		.addr	= &alist->addrs[alist->index],
 		.error	= 0,
 	};
 	_enter("%p", server);
 
-	afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+	if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
+		afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+
 	call_rcu(&server->rcu, afs_server_rcu);
 	afs_dec_servers_outstanding(net);
 }
-- 
cgit v1.2.1


From d4a96bec7a7362834ef5c31d7b2cc9bf36eb0570 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 08:43:04 +0100
Subject: afs: Fix refcounting in callback registration

The refcounting on afs_cb_interest struct objects in
afs_register_server_cb_interest() is wrong as it uses the server list
entry's call back interest pointer without regard for the fact that it
might be replaced at any time and the object thrown away.

Fix this by:

 (1) Put a lock on the afs_server_list struct that can be used to
     mediate access to the callback interest pointers in the servers array.

 (2) Keep a ref on the callback interest that we get from the entry.

 (3) Dropping the old reference held by vnode->cb_interest if we replace
     the pointer.

Fixes: c435ee34551e ("afs: Overhaul the callback handling")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/callback.c    | 56 +++++++++++++++++++++++++++++++++++++---------------
 fs/afs/internal.h    |  7 +++++--
 fs/afs/rotate.c      |  4 ++--
 fs/afs/server_list.c |  7 +++++--
 4 files changed, 52 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index abd9a84f4e88..09332945d322 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -23,36 +23,55 @@
 /*
  * Set up an interest-in-callbacks record for a volume on a server and
  * register it with the server.
- * - Called with volume->server_sem held.
+ * - Called with vnode->io_lock held.
  */
 int afs_register_server_cb_interest(struct afs_vnode *vnode,
-				    struct afs_server_entry *entry)
+				    struct afs_server_list *slist,
+				    unsigned int index)
 {
-	struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x;
+	struct afs_server_entry *entry = &slist->servers[index];
+	struct afs_cb_interest *cbi, *vcbi, *new, *old;
 	struct afs_server *server = entry->server;
 
 again:
+	if (vnode->cb_interest &&
+	    likely(vnode->cb_interest == entry->cb_interest))
+		return 0;
+
+	read_lock(&slist->lock);
+	cbi = afs_get_cb_interest(entry->cb_interest);
+	read_unlock(&slist->lock);
+
 	vcbi = vnode->cb_interest;
 	if (vcbi) {
-		if (vcbi == cbi)
+		if (vcbi == cbi) {
+			afs_put_cb_interest(afs_v2net(vnode), cbi);
 			return 0;
+		}
 
+		/* Use a new interest in the server list for the same server
+		 * rather than an old one that's still attached to a vnode.
+		 */
 		if (cbi && vcbi->server == cbi->server) {
 			write_seqlock(&vnode->cb_lock);
-			vnode->cb_interest = afs_get_cb_interest(cbi);
+			old = vnode->cb_interest;
+			vnode->cb_interest = cbi;
 			write_sequnlock(&vnode->cb_lock);
-			afs_put_cb_interest(afs_v2net(vnode), cbi);
+			afs_put_cb_interest(afs_v2net(vnode), old);
 			return 0;
 		}
 
+		/* Re-use the one attached to the vnode. */
 		if (!cbi && vcbi->server == server) {
-			afs_get_cb_interest(vcbi);
-			x = cmpxchg(&entry->cb_interest, cbi, vcbi);
-			if (x != cbi) {
-				cbi = x;
-				afs_put_cb_interest(afs_v2net(vnode), vcbi);
+			write_lock(&slist->lock);
+			if (entry->cb_interest) {
+				write_unlock(&slist->lock);
+				afs_put_cb_interest(afs_v2net(vnode), cbi);
 				goto again;
 			}
+
+			entry->cb_interest = cbi;
+			write_unlock(&slist->lock);
 			return 0;
 		}
 	}
@@ -72,13 +91,16 @@ again:
 		list_add_tail(&new->cb_link, &server->cb_interests);
 		write_unlock(&server->cb_break_lock);
 
-		x = cmpxchg(&entry->cb_interest, cbi, new);
-		if (x == cbi) {
+		write_lock(&slist->lock);
+		if (!entry->cb_interest) {
+			entry->cb_interest = afs_get_cb_interest(new);
 			cbi = new;
+			new = NULL;
 		} else {
-			cbi = x;
-			afs_put_cb_interest(afs_v2net(vnode), new);
+			cbi = afs_get_cb_interest(entry->cb_interest);
 		}
+		write_unlock(&slist->lock);
+		afs_put_cb_interest(afs_v2net(vnode), new);
 	}
 
 	ASSERT(cbi);
@@ -88,11 +110,13 @@ again:
 	 */
 	write_seqlock(&vnode->cb_lock);
 
-	vnode->cb_interest = afs_get_cb_interest(cbi);
+	old = vnode->cb_interest;
+	vnode->cb_interest = cbi;
 	vnode->cb_s_break = cbi->server->cb_s_break;
 	clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 
 	write_sequnlock(&vnode->cb_lock);
+	afs_put_cb_interest(afs_v2net(vnode), old);
 	return 0;
 }
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8de04b29bec1..e75e57e13320 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -434,6 +434,7 @@ struct afs_server_list {
 	unsigned short		index;		/* Server currently in use */
 	unsigned short		vnovol_mask;	/* Servers to be skipped due to VNOVOL */
 	unsigned int		seq;		/* Set to ->servers_seq when installed */
+	rwlock_t		lock;
 	struct afs_server_entry	servers[];
 };
 
@@ -649,13 +650,15 @@ extern void afs_init_callback_state(struct afs_server *);
 extern void afs_break_callback(struct afs_vnode *);
 extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*);
 
-extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *);
+extern int afs_register_server_cb_interest(struct afs_vnode *,
+					   struct afs_server_list *, unsigned int);
 extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *);
 extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *);
 
 static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi)
 {
-	refcount_inc(&cbi->usage);
+	if (cbi)
+		refcount_inc(&cbi->usage);
 	return cbi;
 }
 
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index ac0feac9d746..4a26d51b2968 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -350,8 +350,8 @@ use_server:
 	 * break request before we've finished decoding the reply and
 	 * installing the vnode.
 	 */
-	fc->ac.error = afs_register_server_cb_interest(
-		vnode, &fc->server_list->servers[fc->index]);
+	fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list,
+						       fc->index);
 	if (fc->ac.error < 0)
 		goto failed;
 
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 0f8dc4c8f07c..8a5760aa5832 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -49,6 +49,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
 		goto error;
 
 	refcount_set(&slist->usage, 1);
+	rwlock_init(&slist->lock);
 
 	/* Make sure a records exists for each server in the list. */
 	for (i = 0; i < vldb->nr_servers; i++) {
@@ -64,9 +65,11 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
 			goto error_2;
 		}
 
-		/* Insertion-sort by server pointer */
+		/* Insertion-sort by UUID */
 		for (j = 0; j < slist->nr_servers; j++)
-			if (slist->servers[j].server >= server)
+			if (memcmp(&slist->servers[j].server->uuid,
+				   &server->uuid,
+				   sizeof(server->uuid)) >= 0)
 				break;
 		if (j < slist->nr_servers) {
 			if (slist->servers[j].server == server) {
-- 
cgit v1.2.1


From ec5a3b4b507efca903d848518dcf2ebf7b04b466 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 14:22:38 +0100
Subject: afs: Fix server rotation's handling of fileserver probe failure

The server rotation algorithm just gives up if it fails to probe a
fileserver.  Fix this by rotating to the next fileserver instead.

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rotate.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 4a26d51b2968..84584dcced72 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -369,8 +369,16 @@ use_server:
 	if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) {
 		fc->ac.alist = afs_get_addrlist(alist);
 
-		if (!afs_probe_fileserver(fc))
-			goto failed;
+		if (!afs_probe_fileserver(fc)) {
+			switch (fc->ac.error) {
+			case -ENOMEM:
+			case -ERESTARTSYS:
+			case -EINTR:
+				goto failed;
+			default:
+				goto next_server;
+			}
+		}
 	}
 
 	if (!fc->ac.alist)
-- 
cgit v1.2.1


From 684b0f68cf1c1cf4a40834818653491c5cad4435 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 21:51:47 +0100
Subject: afs: Fix AFSFetchStatus decoder to provide OpenAFS compatibility

The OpenAFS server's RXAFS_InlineBulkStatus implementation has a bug
whereby if an error occurs on one of the vnodes being queried, then the
errorCode field is set correctly in the corresponding status, but the
interfaceVersion field is left unset.

Fix kAFS to deal with this by evaluating the AFSFetchStatus blob against
the following cases when called from FS.InlineBulkStatus delivery:

 (1) If InterfaceVersion == 0 then:

     (a) If errorCode != 0 then it indicates the abort code for the
         corresponding vnode.

     (b) If errorCode == 0 then the status record is invalid.

 (2) If InterfaceVersion == 1 then:

     (a) If errorCode != 0 then it indicates the abort code for the
         corresponding vnode.

     (b) If errorCode == 0 then the status record is valid and can be
     	 parsed.

 (3) If InterfaceVersion is anything else then the status record is
     invalid.

Fixes: dd9fbcb8e103 ("afs: Rearrange status mapping")
Reported-by: Jeffrey Altman <jaltman@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/fsclient.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index efacdb7c1dee..d16f26c6cdbe 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -134,6 +134,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
 				     struct afs_read *read_req)
 {
 	const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp;
+	bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
 	u64 data_version, size;
 	u32 type, abort_code;
 	u8 flags = 0;
@@ -142,13 +143,32 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
 	if (vnode)
 		write_seqlock(&vnode->cb_lock);
 
+	abort_code = ntohl(xdr->abort_code);
+
 	if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) {
+		if (xdr->if_version == htonl(0) &&
+		    abort_code != 0 &&
+		    inline_error) {
+			/* The OpenAFS fileserver has a bug in FS.InlineBulkStatus
+			 * whereby it doesn't set the interface version in the error
+			 * case.
+			 */
+			status->abort_code = abort_code;
+			ret = 0;
+			goto out;
+		}
+
 		pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version));
 		goto bad;
 	}
 
+	if (abort_code != 0 && inline_error) {
+		status->abort_code = abort_code;
+		ret = 0;
+		goto out;
+	}
+
 	type = ntohl(xdr->type);
-	abort_code = ntohl(xdr->abort_code);
 	switch (type) {
 	case AFS_FTYPE_FILE:
 	case AFS_FTYPE_DIR:
@@ -165,13 +185,6 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
 		}
 		status->type = type;
 		break;
-	case AFS_FTYPE_INVALID:
-		if (abort_code != 0) {
-			status->abort_code = abort_code;
-			ret = 0;
-			goto out;
-		}
-		/* Fall through */
 	default:
 		goto bad;
 	}
-- 
cgit v1.2.1


From 3d9fa91161387ee629e7a07c47934d119910c8ae Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 22:55:59 +0100
Subject: afs: Fix VNOVOL handling in address rotation

If a volume location record lists multiple file servers for a volume, then
it's possible that due to a misconfiguration or a changing configuration
that one of the file servers doesn't know about it yet and will abort
VNOVOL.  Currently, the rotation algorithm will stop with EREMOTEIO.

Fix this by moving on to try the next server if VNOVOL is returned.  Once
all the servers have been tried and the record rechecked, the algorithm
will stop with EREMOTEIO or ENOMEDIUM.

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rotate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 84584dcced72..e065bc0768e6 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -179,7 +179,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 			 */
 			if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
 				fc->ac.error = -EREMOTEIO;
-				goto failed;
+				goto next_server;
 			}
 
 			write_lock(&vnode->volume->servers_lock);
@@ -201,7 +201,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 			 */
 			if (vnode->volume->servers == fc->server_list) {
 				fc->ac.error = -EREMOTEIO;
-				goto failed;
+				goto next_server;
 			}
 
 			/* Try again */
-- 
cgit v1.2.1


From 001ab5a67ee5d191c64aebf4b4ef8c7a0dcfd2bc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 23:21:35 +0100
Subject: afs: Fix the handling of CB.InitCallBackState3 to find the server by
 UUID

Fix the handling of the CB.InitCallBackState3 service call to find the
record of a server that we're using by looking it up by the UUID passed as
the parameter rather than by its address (of which it might have many, and
which may change).

Fixes: c35eccb1f614 ("[AFS]: Implement the CB.InitCallBackState3 operation.")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 357de908df3a..bcd13397bd59 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -341,7 +341,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
  */
 static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 {
-	struct sockaddr_rxrpc srx;
 	struct afs_server *server;
 	struct afs_uuid *r;
 	unsigned loop;
@@ -398,8 +397,9 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-	server = afs_find_server(call->net, &srx);
+	rcu_read_lock();
+	server = afs_find_server_by_uuid(call->net, call->request);
+	rcu_read_unlock();
 	if (!server)
 		return -ENOTCONN;
 	call->cm_server = server;
-- 
cgit v1.2.1


From 3709a399c15e4273d9a94b123374f12e5664318c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 22:59:42 +0100
Subject: afs: Add a tracepoint to record callbacks from unlisted servers

Add a tracepoint to record callbacks from servers for which we don't have a
record.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index bcd13397bd59..9f13375f49b8 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -287,8 +287,10 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 	 * vnodes to operate upon */
 	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
 	server = afs_find_server(call->net, &srx);
-	if (!server)
+	if (!server) {
+		trace_afs_cm_no_server(call, &srx);
 		return -ENOTCONN;
+	}
 	call->cm_server = server;
 
 	return afs_queue_call_work(call);
@@ -329,8 +331,10 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
 	server = afs_find_server(call->net, &srx);
-	if (!server)
+	if (!server) {
+		trace_afs_cm_no_server(call, &srx);
 		return -ENOTCONN;
+	}
 	call->cm_server = server;
 
 	return afs_queue_call_work(call);
@@ -400,8 +404,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	rcu_read_lock();
 	server = afs_find_server_by_uuid(call->net, call->request);
 	rcu_read_unlock();
-	if (!server)
+	if (!server) {
+		trace_afs_cm_no_server_u(call, call->request);
 		return -ENOTCONN;
+	}
 	call->cm_server = server;
 
 	return afs_queue_call_work(call);
-- 
cgit v1.2.1


From a86b06d1ccd218a6a50d6a3a88fbd2abcd0eaa94 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 May 2018 23:45:40 +0100
Subject: afs: Fix the handling of an unfound server in CM operations

If the client cache manager operations that need the server record
(CB.Callback, CB.InitCallBackState, and CB.InitCallBackState3) can't find
the server record, they abort the call from the file server with
RX_CALL_DEAD when they should return okay.

Fixes: c35eccb1f614 ("[AFS]: Implement the CB.InitCallBackState3 operation.")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c | 34 ++++++++++++----------------------
 fs/afs/rxrpc.c     |  5 -----
 2 files changed, 12 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 9f13375f49b8..b44491410af3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -143,8 +143,8 @@ static void afs_cm_destructor(struct afs_call *call)
 	 * received.  The step number here must match the final number in
 	 * afs_deliver_cb_callback().
 	 */
-	if (call->unmarshall == 5) {
-		ASSERT(call->cm_server && call->count && call->request);
+	if (call->cm_server && call->unmarshall == 5) {
+		ASSERT(call->count && call->request);
 		afs_break_callbacks(call->cm_server, call->count, call->request);
 	}
 
@@ -168,7 +168,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 	 * yet */
 	afs_send_empty_reply(call);
 
-	afs_break_callbacks(call->cm_server, call->count, call->request);
+	if (call->cm_server)
+		afs_break_callbacks(call->cm_server, call->count, call->request);
 	afs_put_call(call);
 	_leave("");
 }
@@ -180,7 +181,6 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 {
 	struct afs_callback_break *cb;
 	struct sockaddr_rxrpc srx;
-	struct afs_server *server;
 	__be32 *bp;
 	int ret, loop;
 
@@ -286,12 +286,9 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
 	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-	server = afs_find_server(call->net, &srx);
-	if (!server) {
+	call->cm_server = afs_find_server(call->net, &srx);
+	if (!call->cm_server)
 		trace_afs_cm_no_server(call, &srx);
-		return -ENOTCONN;
-	}
-	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
@@ -305,7 +302,8 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 
 	_enter("{%p}", call->cm_server);
 
-	afs_init_callback_state(call->cm_server);
+	if (call->cm_server)
+		afs_init_callback_state(call->cm_server);
 	afs_send_empty_reply(call);
 	afs_put_call(call);
 	_leave("");
@@ -317,7 +315,6 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 {
 	struct sockaddr_rxrpc srx;
-	struct afs_server *server;
 	int ret;
 
 	_enter("");
@@ -330,12 +327,9 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(call->net, &srx);
-	if (!server) {
+	call->cm_server = afs_find_server(call->net, &srx);
+	if (!call->cm_server)
 		trace_afs_cm_no_server(call, &srx);
-		return -ENOTCONN;
-	}
-	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
@@ -345,7 +339,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
  */
 static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 {
-	struct afs_server *server;
 	struct afs_uuid *r;
 	unsigned loop;
 	__be32 *b;
@@ -402,13 +395,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
 	rcu_read_lock();
-	server = afs_find_server_by_uuid(call->net, call->request);
+	call->cm_server = afs_find_server_by_uuid(call->net, call->request);
 	rcu_read_unlock();
-	if (!server) {
+	if (!call->cm_server)
 		trace_afs_cm_no_server_u(call, call->request);
-		return -ENOTCONN;
-	}
-	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 1f6235a6e9ae..d0eee5d32c94 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -497,11 +497,6 @@ static void afs_deliver_to_call(struct afs_call *call)
 		case -ECONNABORTED:
 			ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
 			goto done;
-		case -ENOTCONN:
-			abort_code = RX_CALL_DEAD;
-			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
-						abort_code, ret, "KNC");
-			goto local_abort;
 		case -ENOTSUPP:
 			abort_code = RXGEN_OPCODE;
 			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
-- 
cgit v1.2.1


From f9c1bba3d392843f046d2ee27b4dfcec989d8a4b Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.dionne@auristor.com>
Date: Fri, 11 May 2018 21:35:06 -0300
Subject: afs: Fix afs_find_server search loop

The code that looks up servers by addresses makes the assumption
that the list of addresses for a server is sorted.  It exits the
loop if it finds that the target address is larger than the
current candidate.  As the list is not currently sorted, this
can lead to a failure to find a matching server, which can cause
callbacks from that server to be ignored.

Remove the early exit case so that the complete list is searched.

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/server.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/server.c b/fs/afs/server.c
index 2c5cff60e34d..3af4625e2f8c 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -67,12 +67,6 @@ struct afs_server *afs_find_server(struct afs_net *net,
 							      sizeof(struct in6_addr));
 					if (diff == 0)
 						goto found;
-					if (diff < 0) {
-						// TODO: Sort the list
-						//if (i == alist->nr_ipv4)
-						//	goto not_found;
-						break;
-					}
 				}
 			}
 		} else {
@@ -87,17 +81,10 @@ struct afs_server *afs_find_server(struct afs_net *net,
 							(u32 __force)b->sin6_addr.s6_addr32[3]);
 					if (diff == 0)
 						goto found;
-					if (diff < 0) {
-						// TODO: Sort the list
-						//if (i == 0)
-						//	goto not_found;
-						break;
-					}
 				}
 			}
 		}
 
-	//not_found:
 		server = NULL;
 	found:
 		if (server && !atomic_inc_not_zero(&server->usage))
-- 
cgit v1.2.1


From 68251f0a6818f3be19b1471f36c956ca97c1427d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 12 May 2018 22:31:33 +0100
Subject: afs: Fix whole-volume callback handling

It's possible for an AFS file server to issue a whole-volume notification
that callbacks on all the vnodes in the file have been broken.  This is
done for R/O and backup volumes (which don't have per-file callbacks) and
for things like a volume being taken offline.

Fix callback handling to detect whole-volume notifications, to track it
across operations and to check it during inode validation.

Fixes: c435ee34551e ("afs: Overhaul the callback handling")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/callback.c | 28 +++++++++++++++++++++-------
 fs/afs/dir.c      | 18 +++++++++---------
 fs/afs/file.c     |  2 +-
 fs/afs/flock.c    |  6 +++---
 fs/afs/fsclient.c |  2 +-
 fs/afs/inode.c    | 13 ++++++++-----
 fs/afs/internal.h | 15 +++++++++++++++
 fs/afs/security.c |  7 +++----
 fs/afs/super.c    |  2 +-
 fs/afs/write.c    |  2 +-
 10 files changed, 63 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 09332945d322..571437dcb252 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -113,6 +113,7 @@ again:
 	old = vnode->cb_interest;
 	vnode->cb_interest = cbi;
 	vnode->cb_s_break = cbi->server->cb_s_break;
+	vnode->cb_v_break = vnode->volume->cb_v_break;
 	clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 
 	write_sequnlock(&vnode->cb_lock);
@@ -195,13 +196,24 @@ static void afs_break_one_callback(struct afs_server *server,
 		if (cbi->vid != fid->vid)
 			continue;
 
-		data.volume = NULL;
-		data.fid = *fid;
-		inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data);
-		if (inode) {
-			vnode = AFS_FS_I(inode);
-			afs_break_callback(vnode);
-			iput(inode);
+		if (fid->vnode == 0 && fid->unique == 0) {
+			/* The callback break applies to an entire volume. */
+			struct afs_super_info *as = AFS_FS_S(cbi->sb);
+			struct afs_volume *volume = as->volume;
+
+			write_lock(&volume->cb_break_lock);
+			volume->cb_v_break++;
+			write_unlock(&volume->cb_break_lock);
+		} else {
+			data.volume = NULL;
+			data.fid = *fid;
+			inode = ilookup5_nowait(cbi->sb, fid->vnode,
+						afs_iget5_test, &data);
+			if (inode) {
+				vnode = AFS_FS_I(inode);
+				afs_break_callback(vnode);
+				iput(inode);
+			}
 		}
 	}
 
@@ -219,6 +231,8 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
 	ASSERT(server != NULL);
 	ASSERTCMP(count, <=, AFSCBMAX);
 
+	/* TODO: Sort the callback break list by volume ID */
+
 	for (; count > 0; callbacks++, count--) {
 		_debug("- Fid { vl=%08x n=%u u=%u }  CB { v=%u x=%u t=%u }",
 		       callbacks->fid.vid,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2853acd64482..7d623008157f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1141,7 +1141,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
 				      &newfid, &newstatus, &newcb);
 		}
@@ -1211,7 +1211,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_remove(&fc, dentry->d_name.name, true,
 				      data_version);
 		}
@@ -1314,7 +1314,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_remove(&fc, dentry->d_name.name, false,
 				      data_version);
 		}
@@ -1371,7 +1371,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
 				      &newfid, &newstatus, &newcb);
 		}
@@ -1441,8 +1441,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
 		}
 
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
-			fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+			fc.cb_break_2 = afs_calc_vnode_cb_break(vnode);
 			afs_fs_link(&fc, vnode, dentry->d_name.name, data_version);
 		}
 
@@ -1510,7 +1510,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			afs_fs_symlink(&fc, dentry->d_name.name,
 				       content, data_version,
 				       &newfid, &newstatus);
@@ -1586,8 +1586,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			}
 		}
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break;
-			fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode);
+			fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
 			afs_fs_rename(&fc, old_dentry->d_name.name,
 				      new_dvnode, new_dentry->d_name.name,
 				      orig_data_version, new_data_version);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index c24c08016dd9..7d4f26198573 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -238,7 +238,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_fetch_data(&fc, desc);
 		}
 
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 7a0e017070ec..dc62d15a964b 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -86,7 +86,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_set_lock(&fc, type);
 		}
 
@@ -117,7 +117,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_current_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_extend_lock(&fc);
 		}
 
@@ -148,7 +148,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_current_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_release_lock(&fc);
 		}
 
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index d16f26c6cdbe..b273e1d60478 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -261,7 +261,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call,
 
 	write_seqlock(&vnode->cb_lock);
 
-	if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) {
+	if (call->cb_break == afs_cb_break_sum(vnode, cbi)) {
 		vnode->cb_version	= ntohl(*bp++);
 		cb_expiry		= ntohl(*bp++);
 		vnode->cb_type		= ntohl(*bp++);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e855c6e5cf28..479b7fdda124 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -108,7 +108,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_fetch_file_status(&fc, NULL, new_inode);
 		}
 
@@ -393,15 +393,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	read_seqlock_excl(&vnode->cb_lock);
 
 	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
-		if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) {
+		if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break ||
+		    vnode->cb_v_break != vnode->volume->cb_v_break) {
 			vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
+			vnode->cb_v_break = vnode->volume->cb_v_break;
+			valid = false;
 		} else if (vnode->status.type == AFS_FTYPE_DIR &&
 			   test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) &&
 			   vnode->cb_expires_at - 10 > now) {
-				valid = true;
+			valid = true;
 		} else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
 			   vnode->cb_expires_at - 10 > now) {
-				valid = true;
+			valid = true;
 		}
 	} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		valid = true;
@@ -574,7 +577,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_setattr(&fc, attr);
 		}
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index e75e57e13320..e3f8a46663db 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -461,6 +461,9 @@ struct afs_volume {
 	rwlock_t		servers_lock;	/* Lock for ->servers */
 	unsigned int		servers_seq;	/* Incremented each time ->servers changes */
 
+	unsigned		cb_v_break;	/* Break-everything counter. */
+	rwlock_t		cb_break_lock;
+
 	afs_voltype_t		type;		/* type of volume */
 	short			error;
 	char			type_force;	/* force volume type (suppress R/O -> R/W) */
@@ -521,6 +524,7 @@ struct afs_vnode {
 	/* outstanding callback notification on this file */
 	struct afs_cb_interest	*cb_interest;	/* Server on which this resides */
 	unsigned int		cb_s_break;	/* Mass break counter on ->server */
+	unsigned int		cb_v_break;	/* Mass break counter on ->volume */
 	unsigned int		cb_break;	/* Break counter on vnode */
 	seqlock_t		cb_lock;	/* Lock for ->cb_interest, ->status, ->cb_*break */
 
@@ -662,6 +666,17 @@ static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest
 	return cbi;
 }
 
+static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
+{
+	return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
+}
+
+static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode,
+					    struct afs_cb_interest *cbi)
+{
+	return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break;
+}
+
 /*
  * cell.c
  */
diff --git a/fs/afs/security.c b/fs/afs/security.c
index cea2fff313dc..1992b0ffa543 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -147,8 +147,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
 					break;
 				}
 
-				if (cb_break != (vnode->cb_break +
-						 vnode->cb_interest->server->cb_s_break)) {
+				if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) {
 					changed = true;
 					break;
 				}
@@ -178,7 +177,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
 		}
 	}
 
-	if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break))
+	if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest))
 		goto someone_else_changed_it;
 
 	/* We need a ref on any permits list we want to copy as we'll have to
@@ -257,7 +256,7 @@ found:
 
 	spin_lock(&vnode->lock);
 	zap = rcu_access_pointer(vnode->permit_cache);
-	if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) &&
+	if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) &&
 	    zap == permits)
 		rcu_assign_pointer(vnode->permit_cache, replacement);
 	else
diff --git a/fs/afs/super.c b/fs/afs/super.c
index b02838cd9525..9e5d7966621c 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -688,7 +688,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (afs_begin_vnode_operation(&fc, vnode, key)) {
 		fc.flags |= AFS_FS_CURSOR_NO_VSLEEP;
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_get_volume_status(&fc, &vs);
 		}
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c164698dc304..8b39e6ebb40b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -351,7 +351,7 @@ found_key:
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) {
 		while (afs_select_fileserver(&fc)) {
-			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
 			afs_fs_store_data(&fc, mapping, first, last, offset, to);
 		}
 
-- 
cgit v1.2.1


From 428edade4e6c70e5b51fcd4188d944fbb744d84c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 12 May 2018 00:28:58 +0100
Subject: afs: Fix CB.CallBack handling

The handling of CB.CallBack messages sent by the fileserver to the client
is broken in that they are currently being processed after the reply has
been transmitted.

This is not what the fileserver expects, however.  It holds up change
visibility until the reply comes so as to maintain cache coherency, and so
expects the client to have to refetch the state on the affected files.

Fix CB.CallBack handling to perform the callback break before sending the
reply.

The fileserver is free to hold up status fetches issued by other threads on
the same client that occur in reponse to the callback until any pending
changes have been committed.

Fixes: d001648ec7cf ("rxrpc: Don't expose skbs to in-kernel users [ver #2]")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/cmservice.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index b44491410af3..c332c95a6940 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -133,21 +133,10 @@ bool afs_cm_incoming_call(struct afs_call *call)
 }
 
 /*
- * clean up a cache manager call
+ * Clean up a cache manager call.
  */
 static void afs_cm_destructor(struct afs_call *call)
 {
-	_enter("");
-
-	/* Break the callbacks here so that we do it after the final ACK is
-	 * received.  The step number here must match the final number in
-	 * afs_deliver_cb_callback().
-	 */
-	if (call->cm_server && call->unmarshall == 5) {
-		ASSERT(call->count && call->request);
-		afs_break_callbacks(call->cm_server, call->count, call->request);
-	}
-
 	kfree(call->buffer);
 	call->buffer = NULL;
 }
@@ -161,15 +150,14 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 
 	_enter("");
 
-	/* be sure to send the reply *before* attempting to spam the AFS server
-	 * with FSFetchStatus requests on the vnodes with broken callbacks lest
-	 * the AFS server get into a vicious cycle of trying to break further
-	 * callbacks because it hadn't received completion of the CBCallBack op
-	 * yet */
-	afs_send_empty_reply(call);
-
+	/* We need to break the callbacks before sending the reply as the
+	 * server holds up change visibility till it receives our reply so as
+	 * to maintain cache coherency.
+	 */
 	if (call->cm_server)
 		afs_break_callbacks(call->cm_server, call->count, call->request);
+
+	afs_send_empty_reply(call);
 	afs_put_call(call);
 	_leave("");
 }
@@ -267,15 +255,6 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 
 		call->offset = 0;
 		call->unmarshall++;
-
-		/* Record that the message was unmarshalled successfully so
-		 * that the call destructor can know do the callback breaking
-		 * work, even if the final ACK isn't received.
-		 *
-		 * If the step number changes, then afs_cm_destructor() must be
-		 * updated also.
-		 */
-		call->unmarshall++;
 	case 5:
 		break;
 	}
-- 
cgit v1.2.1


From 4776cab43fd3111618112737a257dc3ef368eddd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 May 2018 23:10:40 +0100
Subject: afs: Fix the non-encryption of calls

Some AFS servers refuse to accept unencrypted traffic, so can't be accessed
with kAFS.  Set the AF_RXRPC security level to encrypt client calls to deal
with this.

Note that incoming service calls are set by the remote client and so aren't
affected by this.

This requires an AF_RXRPC patch to pass the value set by setsockopt to calls
begun by the kernel.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rxrpc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d0eee5d32c94..08735948f15d 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -41,6 +41,7 @@ int afs_open_socket(struct afs_net *net)
 {
 	struct sockaddr_rxrpc srx;
 	struct socket *socket;
+	unsigned int min_level;
 	int ret;
 
 	_enter("");
@@ -60,6 +61,12 @@ int afs_open_socket(struct afs_net *net)
 	srx.transport.sin6.sin6_family	= AF_INET6;
 	srx.transport.sin6.sin6_port	= htons(AFS_CM_PORT);
 
+	min_level = RXRPC_SECURITY_ENCRYPT;
+	ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
+				(void *)&min_level, sizeof(min_level));
+	if (ret < 0)
+		goto error_2;
+
 	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
 	if (ret == -EADDRINUSE) {
 		srx.transport.sin6.sin6_port = 0;
-- 
cgit v1.2.1


From 6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2 Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Mon, 14 May 2018 10:51:34 +0800
Subject: Btrfs: send, fix invalid access to commit roots due to concurrent
 snapshotting

[BUG]
btrfs incremental send BUG happens when creating a snapshot of snapshot
that is being used by send.

[REASON]
The problem can happen if while we are doing a send one of the snapshots
used (parent or send) is snapshotted, because snapshoting implies COWing
the root of the source subvolume/snapshot.

1. When doing an incremental send, the send process will get the commit
   roots from the parent and send snapshots, and add references to them
   through extent_buffer_get().

2. When a snapshot/subvolume is snapshotted, its root node is COWed
   (transaction.c:create_pending_snapshot()).

3. COWing releases the space used by the node immediately, through:

   __btrfs_cow_block()
   --btrfs_free_tree_block()
   ----btrfs_add_free_space(bytenr of node)

4. Because send doesn't hold a transaction open, it's possible that
   the transaction used to create the snapshot commits, switches the
   commit root and the old space used by the previous root node gets
   assigned to some other node allocation. Allocation of a new node will
   use the existing extent buffer found in memory, which we previously
   got a reference through extent_buffer_get(), and allow the extent
   buffer's content (pages) to be modified:

   btrfs_alloc_tree_block
   --btrfs_reserve_extent
   ----find_free_extent (get bytenr of old node)
   --btrfs_init_new_buffer (use bytenr of old node)
   ----btrfs_find_create_tree_block
   ------alloc_extent_buffer
   --------find_extent_buffer (get old node)

5. So send can access invalid memory content and have unpredictable
   behaviour.

[FIX]
So we fix the problem by copying the commit roots of the send and
parent snapshots and use those copies.

CallTrace looks like this:
 ------------[ cut here ]------------
 kernel BUG at fs/btrfs/ctree.c:1861!
 invalid opcode: 0000 [#1] SMP
 CPU: 6 PID: 24235 Comm: btrfs Tainted: P           O 3.10.105 #23721
 ffff88046652d680 ti: ffff88041b720000 task.ti: ffff88041b720000
 RIP: 0010:[<ffffffffa08dd0e8>] read_node_slot+0x108/0x110 [btrfs]
 RSP: 0018:ffff88041b723b68  EFLAGS: 00010246
 RAX: ffff88043ca6b000 RBX: ffff88041b723c50 RCX: ffff880000000000
 RDX: 000000000000004c RSI: ffff880314b133f8 RDI: ffff880458b24000
 RBP: 0000000000000000 R08: 0000000000000001 R09: ffff88041b723c66
 R10: 0000000000000001 R11: 0000000000001000 R12: ffff8803f3e48890
 R13: ffff8803f3e48880 R14: ffff880466351800 R15: 0000000000000001
 FS:  00007f8c321dc8c0(0000) GS:ffff88047fcc0000(0000)
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 R2: 00007efd1006d000 CR3: 0000000213a24000 CR4: 00000000003407e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Stack:
 ffff88041b723c50 ffff8803f3e48880 ffff8803f3e48890 ffff8803f3e48880
 ffff880466351800 0000000000000001 ffffffffa08dd9d7 ffff88041b723c50
 ffff8803f3e48880 ffff88041b723c66 ffffffffa08dde85 a9ff88042d2c4400
 Call Trace:
 [<ffffffffa08dd9d7>] ? tree_move_down.isra.33+0x27/0x50 [btrfs]
 [<ffffffffa08dde85>] ? tree_advance+0xb5/0xc0 [btrfs]
 [<ffffffffa08e83d4>] ? btrfs_compare_trees+0x2d4/0x760 [btrfs]
 [<ffffffffa0982050>] ? finish_inode_if_needed+0x870/0x870 [btrfs]
 [<ffffffffa09841ea>] ? btrfs_ioctl_send+0xeda/0x1050 [btrfs]
 [<ffffffffa094bd3d>] ? btrfs_ioctl+0x1e3d/0x33f0 [btrfs]
 [<ffffffff81111133>] ? handle_pte_fault+0x373/0x990
 [<ffffffff8153a096>] ? atomic_notifier_call_chain+0x16/0x20
 [<ffffffff81063256>] ? set_task_cpu+0xb6/0x1d0
 [<ffffffff811122c3>] ? handle_mm_fault+0x143/0x2a0
 [<ffffffff81539cc0>] ? __do_page_fault+0x1d0/0x500
 [<ffffffff81062f07>] ? check_preempt_curr+0x57/0x90
 [<ffffffff8115075a>] ? do_vfs_ioctl+0x4aa/0x990
 [<ffffffff81034f83>] ? do_fork+0x113/0x3b0
 [<ffffffff812dd7d7>] ? trace_hardirqs_off_thunk+0x3a/0x6c
 [<ffffffff81150cc8>] ? SyS_ioctl+0x88/0xa0
 [<ffffffff8153e422>] ? system_call_fastpath+0x16/0x1b
 ---[ end trace 29576629ee80b2e1 ]---

Fixes: 7069830a9e38 ("Btrfs: add btrfs_compare_trees function")
CC: stable@vger.kernel.org # 3.6+
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fd44835b386..63488f0b850f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5414,12 +5414,24 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	down_read(&fs_info->commit_root_sem);
 	left_level = btrfs_header_level(left_root->commit_root);
 	left_root_level = left_level;
-	left_path->nodes[left_level] = left_root->commit_root;
+	left_path->nodes[left_level] =
+			btrfs_clone_extent_buffer(left_root->commit_root);
+	if (!left_path->nodes[left_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
 	extent_buffer_get(left_path->nodes[left_level]);
 
 	right_level = btrfs_header_level(right_root->commit_root);
 	right_root_level = right_level;
-	right_path->nodes[right_level] = right_root->commit_root;
+	right_path->nodes[right_level] =
+			btrfs_clone_extent_buffer(right_root->commit_root);
+	if (!right_path->nodes[right_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
 	extent_buffer_get(right_path->nodes[right_level]);
 	up_read(&fs_info->commit_root_sem);
 
-- 
cgit v1.2.1


From 9a8fca62aacc1599fea8e813d01e1955513e4fad Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 11 May 2018 16:42:42 +0100
Subject: Btrfs: fix xattr loss after power failure

If a file has xattrs, we fsync it, to ensure we clear the flags
BTRFS_INODE_NEEDS_FULL_SYNC and BTRFS_INODE_COPY_EVERYTHING from its
inode, the current transaction commits and then we fsync it (without
either of those bits being set in its inode), we end up not logging
all its xattrs. This results in deleting all xattrs when replying the
log after a power failure.

Trivial reproducer

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foobar
  $ setfattr -n user.xa -v qwerty /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar

  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 64K" /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar
  <power failure>

  $ mount /dev/sdb /mnt
  $ getfattr --absolute-names --dump /mnt/foobar
  <empty output>
  $

So fix this by making sure all xattrs are logged if we log a file's inode
item and neither the flags BTRFS_INODE_NEEDS_FULL_SYNC nor
BTRFS_INODE_COPY_EVERYTHING were set in the inode.

Fixes: 36283bf777d9 ("Btrfs: fix fsync xattr loss in the fast fsync path")
Cc: <stable@vger.kernel.org> # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 43758e30aa7a..c1509547c762 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4827,6 +4827,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree = &inode->extent_tree;
 	u64 logged_isize = 0;
 	bool need_log_inode_item = true;
+	bool xattrs_logged = false;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -5128,6 +5129,7 @@ next_key:
 	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
 	if (err)
 		goto out_unlock;
+	xattrs_logged = true;
 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
 		btrfs_release_path(path);
 		btrfs_release_path(dst_path);
@@ -5140,6 +5142,11 @@ log_extents:
 	btrfs_release_path(dst_path);
 	if (need_log_inode_item) {
 		err = log_inode_item(trans, log, dst_path, inode);
+		if (!err && !xattrs_logged) {
+			err = btrfs_log_all_xattrs(trans, root, inode, path,
+						   dst_path);
+			btrfs_release_path(path);
+		}
 		if (err)
 			goto out_unlock;
 	}
-- 
cgit v1.2.1


From 31d11b83b96faaee4bb514d375a09489117c3e8d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 9 May 2018 16:01:46 +0100
Subject: Btrfs: fix duplicate extents after fsync of file with prealloc
 extents

In commit 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size
after fsync log replay"), on fsync,  we started to always log all prealloc
extents beyond an inode's i_size in order to avoid losing them after a
power failure. However under some cases this can lead to the log replay
code to create duplicate extent items, with different lengths, in the
extent tree. That happens because, as of that commit, we can now log
extent items based on extent maps that are not on the "modified" list
of extent maps of the inode's extent map tree. Logging extent items based
on extent maps is used during the fast fsync path to save time and for
this to work reliably it requires that the extent maps are not merged
with other adjacent extent maps - having the extent maps in the list
of modified extents gives such guarantee.

Consider the following example, captured during a long run of fsstress,
which illustrates this problem.

We have inode 271, in the filesystem tree (root 5), for which all of the
following operations and discussion apply to.

A buffered write starts at offset 312391 with a length of 933471 bytes
(end offset at 1245862). At this point we have, for this inode, the
following extent maps with the their field values:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
      block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 376832, block_start 1106399232,
      block_len 376832, orig_block_len 376832
em C, start 417792, orig_start 417792, len 782336, block_start
      18446744073709551613, block_len 0, orig_block_len 0
em D, start 1200128, orig_start 1200128, len 835584, block_start
      1106776064, block_len 835584, orig_block_len 835584
em E, start 2035712, orig_start 2035712, len 245760, block_start
      1107611648, block_len 245760, orig_block_len 245760

Extent map A corresponds to a hole and extent maps D and E correspond to
preallocated extents.

Extent map D ends where extent map E begins (1106776064 + 835584 =
1107611648), but these extent maps were not merged because they are in
the inode's list of modified extent maps.

An fsync against this inode is made, which triggers the fast path
(BTRFS_INODE_NEEDS_FULL_SYNC is not set). This fsync triggers writeback
of the data previously written using buffered IO, and when the respective
ordered extent finishes, btrfs_drop_extents() is called against the
(aligned) range 311296..1249279. This causes a split of extent map D at
btrfs_drop_extent_cache(), replacing extent map D with a new extent map
D', also added to the list of modified extents,  with the following
values:

em D', start 1249280, orig_start of 1200128,
       block_start 1106825216 (= 1106776064 + 1249280 - 1200128),
       orig_block_len 835584,
       block_len 786432 (835584 - (1249280 - 1200128))

Then, during the fast fsync, btrfs_log_changed_extents() is called and
extent maps D' and E are removed from the list of modified extents. The
flag EXTENT_FLAG_LOGGING is also set on them. After the extents are logged
clear_em_logging() is called on each of them, and that makes extent map E
to be merged with extent map D' (try_merge_map()), resulting in D' being
deleted and E adjusted to:

em E, start 1249280, orig_start 1200128, len 1032192,
      block_start 1106825216, block_len 1032192,
      orig_block_len 245760

A direct IO write at offset 1847296 and length of 360448 bytes (end offset
at 2207744) starts, and at that moment the following extent maps exist for
our inode:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
      block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 270336, block_start 1106399232,
      block_len 270336, orig_block_len 376832
em C, start 311296, orig_start 311296, len 937984, block_start 1112842240,
      block_len 937984, orig_block_len 937984
em E (prealloc), start 1249280, orig_start 1200128, len 1032192,
      block_start 1106825216, block_len 1032192, orig_block_len 245760

The dio write results in drop_extent_cache() being called twice. The first
time for a range that starts at offset 1847296 and ends at offset 2035711
(length of 188416), which results in a double split of extent map E,
replacing it with two new extent maps:

em F, start 1249280, orig_start 1200128, block_start 1106825216,
      block_len 598016, orig_block_len 598016
em G, start 2035712, orig_start 1200128, block_start 1107611648,
      block_len 245760, orig_block_len 1032192

It also creates a new extent map that represents a part of the requested
IO (through create_io_em()):

em H, start 1847296, len 188416, block_start 1107423232, block_len 188416

The second call to drop_extent_cache() has a range with a start offset of
2035712 and end offset of 2207743 (length of 172032). This leads to
replacing extent map G with a new extent map I with the following values:

em I, start 2207744, orig_start 1200128, block_start 1107783680,
      block_len 73728, orig_block_len 1032192

It also creates a new extent map that represents the second part of the
requested IO (through create_io_em()):

em J, start 2035712, len 172032, block_start 1107611648, block_len 172032

The dio write set the inode's i_size to 2207744 bytes.

After the dio write the inode has the following extent maps:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
      block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 270336, block_start 1106399232,
      block_len 270336, orig_block_len 376832
em C, start 311296, orig_start 311296, len 937984, block_start 1112842240,
      block_len 937984, orig_block_len 937984
em F, start 1249280, orig_start 1200128, len 598016,
      block_start 1106825216, block_len 598016, orig_block_len 598016
em H, start 1847296, orig_start 1200128, len 188416,
      block_start 1107423232, block_len 188416, orig_block_len 835584
em J, start 2035712, orig_start 2035712, len 172032,
      block_start 1107611648, block_len 172032, orig_block_len 245760
em I, start 2207744, orig_start 1200128, len 73728,
      block_start 1107783680, block_len 73728, orig_block_len 1032192

Now do some change to the file, like adding a xattr for example and then
fsync it again. This triggers a fast fsync path, and as of commit
471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync
log replay"), we use the extent map I to log a file extent item because
it's a prealloc extent and it starts at an offset matching the inode's
i_size. However when we log it, we create a file extent item with a value
for the disk byte location that is wrong, as can be seen from the
following output of "btrfs inspect-internal dump-tree":

 item 1 key (271 EXTENT_DATA 2207744) itemoff 3782 itemsize 53
     generation 22 type 2 (prealloc)
     prealloc data disk byte 1106776064 nr 1032192
     prealloc data offset 1007616 nr 73728

Here the disk byte value corresponds to calculation based on some fields
from the extent map I:

  1106776064 = block_start (1107783680) - 1007616 (extent_offset)
  extent_offset = 2207744 (start) - 1200128 (orig_start) = 1007616

The disk byte value of 1106776064 clashes with disk byte values of the
file extent items at offsets 1249280 and 1847296 in the fs tree:

        item 6 key (271 EXTENT_DATA 1249280) itemoff 3568 itemsize 53
                generation 20 type 2 (prealloc)
                prealloc data disk byte 1106776064 nr 835584
                prealloc data offset 49152 nr 598016
        item 7 key (271 EXTENT_DATA 1847296) itemoff 3515 itemsize 53
                generation 20 type 1 (regular)
                extent data disk byte 1106776064 nr 835584
                extent data offset 647168 nr 188416 ram 835584
                extent compression 0 (none)
        item 8 key (271 EXTENT_DATA 2035712) itemoff 3462 itemsize 53
                generation 20 type 1 (regular)
                extent data disk byte 1107611648 nr 245760
                extent data offset 0 nr 172032 ram 245760
                extent compression 0 (none)
        item 9 key (271 EXTENT_DATA 2207744) itemoff 3409 itemsize 53
                generation 20 type 2 (prealloc)
                prealloc data disk byte 1107611648 nr 245760
                prealloc data offset 172032 nr 73728

Instead of the disk byte value of 1106776064, the value of 1107611648
should have been logged. Also the data offset value should have been
172032 and not 1007616.
After a log replay we end up getting two extent items in the extent tree
with different lengths, one of 835584, which is correct and existed
before the log replay, and another one of 1032192 which is wrong and is
based on the logged file extent item:

 item 12 key (1106776064 EXTENT_ITEM 835584) itemoff 3406 itemsize 53
    refs 2 gen 15 flags DATA
    extent data backref root 5 objectid 271 offset 1200128 count 2
 item 13 key (1106776064 EXTENT_ITEM 1032192) itemoff 3353 itemsize 53
    refs 1 gen 22 flags DATA
    extent data backref root 5 objectid 271 offset 1200128 count 1

Obviously this leads to many problems and a filesystem check reports many
errors:

 (...)
 checking extents
 Extent back ref already exists for 1106776064 parent 0 root 5 owner 271 offset 1200128 num_refs 1
 extent item 1106776064 has multiple extent items
 ref mismatch on [1106776064 835584] extent item 2, found 3
 Incorrect local backref count on 1106776064 root 5 owner 271 offset 1200128 found 2 wanted 1 back 0x55b1d0ad7680
 Backref 1106776064 root 5 owner 271 offset 1200128 num_refs 0 not found in extent tree
 Incorrect local backref count on 1106776064 root 5 owner 271 offset 1200128 found 1 wanted 0 back 0x55b1d0ad4e70
 Backref bytes do not match extent backref, bytenr=1106776064, ref bytes=835584, backref bytes=1032192
 backpointer mismatch on [1106776064 835584]
 checking free space cache
 block group 1103101952 has wrong amount of free space
 failed to load free space cache for block group 1103101952
 checking fs roots
 (...)

So fix this by logging the prealloc extents beyond the inode's i_size
based on searches in the subvolume tree instead of the extent maps.

Fixes: 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay")
CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 137 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 112 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c1509547c762..8f23a94dab77 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4320,6 +4320,110 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * Log all prealloc extents beyond the inode's i_size to make sure we do not
+ * lose them after doing a fast fsync and replaying the log. We scan the
+ * subvolume's root instead of iterating the inode's extent map tree because
+ * otherwise we can log incorrect extent items based on extent map conversion.
+ * That can happen due to the fact that extent maps are merged when they
+ * are not in the extent map tree's list of modified extents.
+ */
+static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
+				      struct btrfs_inode *inode,
+				      struct btrfs_path *path)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_key key;
+	const u64 i_size = i_size_read(&inode->vfs_inode);
+	const u64 ino = btrfs_ino(inode);
+	struct btrfs_path *dst_path = NULL;
+	u64 last_extent = (u64)-1;
+	int ins_nr = 0;
+	int start_slot;
+	int ret;
+
+	if (!(inode->flags & BTRFS_INODE_PREALLOC))
+		return 0;
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = i_size;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (true) {
+		struct extent_buffer *leaf = path->nodes[0];
+		int slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			if (ins_nr > 0) {
+				ret = copy_items(trans, inode, dst_path, path,
+						 &last_extent, start_slot,
+						 ins_nr, 1, 0);
+				if (ret < 0)
+					goto out;
+				ins_nr = 0;
+			}
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				ret = 0;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid > ino)
+			break;
+		if (WARN_ON_ONCE(key.objectid < ino) ||
+		    key.type < BTRFS_EXTENT_DATA_KEY ||
+		    key.offset < i_size) {
+			path->slots[0]++;
+			continue;
+		}
+		if (last_extent == (u64)-1) {
+			last_extent = key.offset;
+			/*
+			 * Avoid logging extent items logged in past fsync calls
+			 * and leading to duplicate keys in the log tree.
+			 */
+			do {
+				ret = btrfs_truncate_inode_items(trans,
+							 root->log_root,
+							 &inode->vfs_inode,
+							 i_size,
+							 BTRFS_EXTENT_DATA_KEY);
+			} while (ret == -EAGAIN);
+			if (ret)
+				goto out;
+		}
+		if (ins_nr == 0)
+			start_slot = slot;
+		ins_nr++;
+		path->slots[0]++;
+		if (!dst_path) {
+			dst_path = btrfs_alloc_path();
+			if (!dst_path) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+	}
+	if (ins_nr > 0) {
+		ret = copy_items(trans, inode, dst_path, path, &last_extent,
+				 start_slot, ins_nr, 1, 0);
+		if (ret > 0)
+			ret = 0;
+	}
+out:
+	btrfs_release_path(path);
+	btrfs_free_path(dst_path);
+	return ret;
+}
+
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct btrfs_inode *inode,
@@ -4362,6 +4466,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		if (em->generation <= test_gen)
 			continue;
 
+		/* We log prealloc extents beyond eof later. */
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+		    em->start >= i_size_read(&inode->vfs_inode))
+			continue;
+
 		if (em->start < logged_start)
 			logged_start = em->start;
 		if ((em->start + em->len - 1) > logged_end)
@@ -4374,31 +4483,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		num++;
 	}
 
-	/*
-	 * Add all prealloc extents beyond the inode's i_size to make sure we
-	 * don't lose them after doing a fast fsync and replaying the log.
-	 */
-	if (inode->flags & BTRFS_INODE_PREALLOC) {
-		struct rb_node *node;
-
-		for (node = rb_last(&tree->map); node; node = rb_prev(node)) {
-			em = rb_entry(node, struct extent_map, rb_node);
-			if (em->start < i_size_read(&inode->vfs_inode))
-				break;
-			if (!list_empty(&em->list))
-				continue;
-			/* Same as above loop. */
-			if (++num > 32768) {
-				list_del_init(&tree->modified_extents);
-				ret = -EFBIG;
-				goto process;
-			}
-			refcount_inc(&em->refs);
-			set_bit(EXTENT_FLAG_LOGGING, &em->flags);
-			list_add_tail(&em->list, &extents);
-		}
-	}
-
 	list_sort(NULL, &extents, extent_cmp);
 	btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
 	/*
@@ -4443,6 +4527,9 @@ process:
 	up_write(&inode->dio_sem);
 
 	btrfs_release_path(path);
+	if (!ret)
+		ret = btrfs_log_prealloc_extents(trans, inode, path);
+
 	return ret;
 }
 
-- 
cgit v1.2.1


From 1a63c198ddb810c790101d693c7071cca703b3c7 Mon Sep 17 00:00:00 2001
From: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Date: Tue, 15 May 2018 16:51:26 +0900
Subject: btrfs: property: Set incompat flag if lzo/zstd compression is set

Incompat flag of LZO/ZSTD compression should be set at:

 1. mount time (-o compress/compress-force)
 2. when defrag is done
 3. when property is set

Currently 3. is missing and this commit adds this.

This could lead to a filesystem that uses ZSTD but is not marked as
such. If a kernel without a ZSTD support encounteres a ZSTD compressed
extent, it will handle that but this could be confusing to the user.

Typically the filesystem is mounted with the ZSTD option, but the
discrepancy can arise when a filesystem is never mounted with ZSTD and
then the property on some file is set (and some new extents are
written). A simple mount with -o compress=zstd will fix that up on an
unpatched kernel.

Same goes for LZO, but this has been around for a very long time
(2.6.37) so it's unlikely that a pre-LZO kernel would be used.

Fixes: 5c1aab1dd544 ("btrfs: Add zstd support")
CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add user visible impact ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/props.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 53a8c95828e3..dc6140013ae8 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -380,6 +380,7 @@ static int prop_compression_apply(struct inode *inode,
 				  const char *value,
 				  size_t len)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int type;
 
 	if (len == 0) {
@@ -390,14 +391,17 @@ static int prop_compression_apply(struct inode *inode,
 		return 0;
 	}
 
-	if (!strncmp("lzo", value, 3))
+	if (!strncmp("lzo", value, 3)) {
 		type = BTRFS_COMPRESS_LZO;
-	else if (!strncmp("zlib", value, 4))
+		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+	} else if (!strncmp("zlib", value, 4)) {
 		type = BTRFS_COMPRESS_ZLIB;
-	else if (!strncmp("zstd", value, len))
+	} else if (!strncmp("zstd", value, len)) {
 		type = BTRFS_COMPRESS_ZSTD;
-	else
+		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+	} else {
 		return -EINVAL;
+	}
 
 	BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
 	BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
-- 
cgit v1.2.1


From 02a3307aa9c20b4f6626255b028f07f6cfa16feb Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.liu@linux.alibaba.com>
Date: Wed, 16 May 2018 01:37:36 +0800
Subject: btrfs: fix reading stale metadata blocks after degraded raid1 mounts

If a btree block, aka. extent buffer, is not available in the extent
buffer cache, it'll be read out from the disk instead, i.e.

btrfs_search_slot()
  read_block_for_search()  # hold parent and its lock, go to read child
    btrfs_release_path()
    read_tree_block()  # read child

Unfortunately, the parent lock got released before reading child, so
commit 5bdd3536cbbe ("Btrfs: Fix block generation verification race") had
used 0 as parent transid to read the child block.  It forces
read_tree_block() not to check if parent transid is different with the
generation id of the child that it reads out from disk.

A simple PoC is included in btrfs/124,

0. A two-disk raid1 btrfs,

1. Right after mkfs.btrfs, block A is allocated to be device tree's root.

2. Mount this filesystem and put it in use, after a while, device tree's
   root got COW but block A hasn't been allocated/overwritten yet.

3. Umount it and reload the btrfs module to remove both disks from the
   global @fs_devices list.

4. mount -odegraded dev1 and write some data, so now block A is allocated
   to be a leaf in checksum tree.  Note that only dev1 has the latest
   metadata of this filesystem.

5. Umount it and mount it again normally (with both disks), since raid1
   can pick up one disk by the writer task's pid, if btrfs_search_slot()
   needs to read block A, dev2 which does NOT have the latest metadata
   might be read for block A, then we got a stale block A.

6. As parent transid is not checked, block A is marked as uptodate and
   put into the extent buffer cache, so the future search won't bother
   to read disk again, which means it'll make changes on this stale
   one and make it dirty and flush it onto disk.

To avoid the problem, parent transid needs to be passed to
read_tree_block().

In order to get a valid parent transid, we need to hold the parent's
lock until finishing reading child.

This patch needs to be slightly adapted for stable kernels, the
&first_key parameter added to read_tree_block() is from 4.16+
(581c1760415c4). The fix is to replace 0 by 'gen'.

Fixes: 5bdd3536cbbe ("Btrfs: Fix block generation verification race")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 63488f0b850f..8c68961925b1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2436,10 +2436,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	if (p->reada != READA_NONE)
 		reada_for_search(fs_info, p, level, slot, key->objectid);
 
-	btrfs_release_path(p);
-
 	ret = -EAGAIN;
-	tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
+	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
 			      &first_key);
 	if (!IS_ERR(tmp)) {
 		/*
@@ -2454,6 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	} else {
 		ret = PTR_ERR(tmp);
 	}
+
+	btrfs_release_path(p);
 	return ret;
 }
 
-- 
cgit v1.2.1


From 2b8773313494ede83a26fb372466e634564002ed Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 27 Apr 2018 12:21:51 +0300
Subject: btrfs: Split btrfs_del_delalloc_inode into 2 functions

This is in preparation of fixing delalloc inodes leakage on transaction
abort. Also export the new function.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/inode.c | 13 ++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2771cc56a622..0d422c9908b8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3182,6 +3182,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes);
 
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+				struct btrfs_inode *inode);
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d241285a0d2a..8e604e7071f1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1742,12 +1742,12 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
 	spin_unlock(&root->delalloc_lock);
 }
 
-static void btrfs_del_delalloc_inode(struct btrfs_root *root,
-				     struct btrfs_inode *inode)
+
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+				struct btrfs_inode *inode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 
-	spin_lock(&root->delalloc_lock);
 	if (!list_empty(&inode->delalloc_inodes)) {
 		list_del_init(&inode->delalloc_inodes);
 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
@@ -1760,6 +1760,13 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
 			spin_unlock(&fs_info->delalloc_root_lock);
 		}
 	}
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct btrfs_inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	__btrfs_del_delalloc_inode(root, inode);
 	spin_unlock(&root->delalloc_lock);
 }
 
-- 
cgit v1.2.1


From fe816d0f1d4c31c4c31d42ca78a87660565fc800 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 27 Apr 2018 12:21:53 +0300
Subject: btrfs: Fix delalloc inodes invalidation during transaction abort

When a transaction is aborted btrfs_cleanup_transaction is called to
cleanup all the various in-flight bits and pieces which migth be
active. One of those is delalloc inodes - inodes which have dirty
pages which haven't been persisted yet. Currently the process of
freeing such delalloc inodes in exceptional circumstances such as
transaction abort boiled down to calling btrfs_invalidate_inodes whose
sole job is to invalidate the dentries for all inodes related to a
root. This is in fact wrong and insufficient since such delalloc inodes
will likely have pending pages or ordered-extents and will be linked to
the sb->s_inode_list. This means that unmounting a btrfs instance with
an aborted transaction could potentially lead inodes/their pages
visible to the system long after their superblock has been freed. This
in turn leads to a "use-after-free" situation once page shrink is
triggered. This situation could be simulated by running generic/019
which would cause such inodes to be left hanging, followed by
generic/176 which causes memory pressure and page eviction which lead
to touching the freed super block instance. This situation is
additionally detected by the unmount code of VFS with the following
message:

"VFS: Busy inodes after unmount of Self-destruct in 5 seconds.  Have a nice day..."

Additionally btrfs hits WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
in free_fs_root for the same reason.

This patch aims to rectify the sitaution by doing the following:

1. Change btrfs_destroy_delalloc_inodes so that it calls
invalidate_inode_pages2 for every inode on the delalloc list, this
ensures that all the pages of the inode are released. This function
boils down to calling btrfs_releasepage. During test I observed cases
where inodes on the delalloc list were having an i_count of 0, so this
necessitates using igrab to be sure we are working on a non-freed inode.

2. Since calling btrfs_releasepage might queue delayed iputs move the
call out to btrfs_cleanup_transaction in btrfs_error_commit_super before
calling run_delayed_iputs for the last time. This is necessary to ensure
that delayed iputs are run.

Note: this patch is tagged for 4.14 stable but the fix applies to older
versions too but needs to be backported manually due to conflicts.

CC: stable@vger.kernel.org # 4.14.x: 2b8773313494: btrfs: Split btrfs_del_delalloc_inode into 2 functions
CC: stable@vger.kernel.org # 4.14.x
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add comment to igrab ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60caa68c3618..c3504b4d281b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3818,6 +3818,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
 
 	btrfs_free_qgroup_config(fs_info);
+	ASSERT(list_empty(&fs_info->delalloc_roots));
 
 	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
 		btrfs_info(fs_info, "at unmount delalloc count %lld",
@@ -4125,15 +4126,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
 
 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
 {
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(fs_info);
+
 	mutex_lock(&fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(fs_info);
 	mutex_unlock(&fs_info->cleaner_mutex);
 
 	down_write(&fs_info->cleanup_work_sem);
 	up_write(&fs_info->cleanup_work_sem);
-
-	/* cleanup FS via transaction */
-	btrfs_cleanup_transaction(fs_info);
 }
 
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
@@ -4258,19 +4259,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 	list_splice_init(&root->delalloc_inodes, &splice);
 
 	while (!list_empty(&splice)) {
+		struct inode *inode = NULL;
 		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
 					       delalloc_inodes);
-
-		list_del_init(&btrfs_inode->delalloc_inodes);
-		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-			  &btrfs_inode->runtime_flags);
+		__btrfs_del_delalloc_inode(root, btrfs_inode);
 		spin_unlock(&root->delalloc_lock);
 
-		btrfs_invalidate_inodes(btrfs_inode->root);
-
+		/*
+		 * Make sure we get a live inode and that it'll not disappear
+		 * meanwhile.
+		 */
+		inode = igrab(&btrfs_inode->vfs_inode);
+		if (inode) {
+			invalidate_inode_pages2(inode->i_mapping);
+			iput(inode);
+		}
 		spin_lock(&root->delalloc_lock);
 	}
-
 	spin_unlock(&root->delalloc_lock);
 }
 
@@ -4286,7 +4291,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
 	while (!list_empty(&splice)) {
 		root = list_first_entry(&splice, struct btrfs_root,
 					 delalloc_root);
-		list_del_init(&root->delalloc_root);
 		root = btrfs_grab_fs_root(root);
 		BUG_ON(!root);
 		spin_unlock(&fs_info->delalloc_root_lock);
-- 
cgit v1.2.1


From 02ee654d3a04563c67bfe658a05384548b9bb105 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 17 May 2018 15:16:51 +0800
Subject: btrfs: fix crash when trying to resume balance without the resume
 flag

We set the BTRFS_BALANCE_RESUME flag in the btrfs_recover_balance()
only, which isn't called during the remount. So when resuming from
the paused balance we hit the bug:

 kernel: kernel BUG at fs/btrfs/volumes.c:3890!
 ::
 kernel:  balance_kthread+0x51/0x60 [btrfs]
 kernel:  kthread+0x111/0x130
 ::
 kernel: RIP: btrfs_balance+0x12e1/0x1570 [btrfs] RSP: ffffba7d0090bde8

Reproducer:
  On a mounted filesystem:

  btrfs balance start --full-balance /btrfs
  btrfs balance pause /btrfs
  mount -o remount,ro /dev/sdb /btrfs
  mount -o remount,rw /dev/sdb /btrfs

To fix this set the BTRFS_BALANCE_RESUME flag in
btrfs_resume_balance_async().

CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 292266f6ab9c..be3fc701f389 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4052,6 +4052,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 		return 0;
 	}
 
+	/*
+	 * A ro->rw remount sequence should continue with the paused balance
+	 * regardless of who pauses it, system or the user as of now, so set
+	 * the resume flag.
+	 */
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
+	spin_unlock(&fs_info->balance_lock);
+
 	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
 	return PTR_ERR_OR_ZERO(tsk);
 }
-- 
cgit v1.2.1


From 7f7ccc2ccc2e70c6054685f5e3522efa81556830 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Fri, 11 May 2018 08:11:44 +0200
Subject: proc: do not access cmdline nor environ from file-backed areas

proc_pid_cmdline_read() and environ_read() directly access the target
process' VM to retrieve the command line and environment. If this
process remaps these areas onto a file via mmap(), the requesting
process may experience various issues such as extra delays if the
underlying device is slow to respond.

Let's simply refuse to access file-backed areas in these functions.
For this we add a new FOLL_ANON gup flag that is passed to all calls
to access_remote_vm(). The code already takes care of such failures
(including unmapped areas). Accesses via /proc/pid/mem were not
changed though.

This was assigned CVE-2018-1120.

Note for stable backports: the patch may apply to kernels prior to 4.11
but silently miss one location; it must be checked that no call to
access_remote_vm() keeps zero as the last argument.

Reported-by: Qualys Security Advisory <qsa@qualys.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b2ede6abcdf..1a76d751cf3c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -261,7 +261,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
 	 * Inherently racy -- command line shares address space
 	 * with code and data.
 	 */
-	rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+	rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON);
 	if (rv <= 0)
 		goto out_free_page;
 
@@ -279,7 +279,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
 			int nr_read;
 
 			_count = min3(count, len, PAGE_SIZE);
-			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
 			if (nr_read < 0)
 				rv = nr_read;
 			if (nr_read <= 0)
@@ -325,7 +325,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
 				bool final;
 
 				_count = min3(count, len, PAGE_SIZE);
-				nr_read = access_remote_vm(mm, p, page, _count, 0);
+				nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
 				if (nr_read < 0)
 					rv = nr_read;
 				if (nr_read <= 0)
@@ -946,7 +946,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 		max_len = min_t(size_t, PAGE_SIZE, count);
 		this_len = min(max_len, this_len);
 
-		retval = access_remote_vm(mm, (env_start + src), page, this_len, 0);
+		retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
 
 		if (retval <= 0) {
 			ret = retval;
-- 
cgit v1.2.1


From 66072c29328717072fd84aaff3e070e3f008ba77 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 18 May 2018 16:09:16 -0700
Subject: hfsplus: stop workqueue when fill_super() failed

syzbot is reporting ODEBUG messages at hfsplus_fill_super() [1].  This
is because hfsplus_fill_super() forgot to call cancel_delayed_work_sync().

As far as I can see, it is hfsplus_mark_mdb_dirty() from
hfsplus_new_inode() in hfsplus_fill_super() that calls
queue_delayed_work().  Therefore, I assume that hfsplus_new_inode() does
not fail if queue_delayed_work() was called, and the out_put_hidden_dir
label is the appropriate location to call cancel_delayed_work_sync().

[1] https://syzkaller.appspot.com/bug?id=a66f45e96fdbeb76b796bf46eb25ea878c42a6c9

Link: http://lkml.kernel.org/r/964a8b27-cd69-357c-fe78-76b066056201@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+4f2e5f086147d543ab03@syzkaller.appspotmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Ernesto A. Fernandez <ernesto.mnd.fernandez@gmail.com>
Cc: Vyacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 513c357c734b..a6c0f54c48c3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -588,6 +588,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 
 out_put_hidden_dir:
+	cancel_delayed_work_sync(&sbi->sync_work);
 	iput(sbi->hidden_dir);
 out_put_root:
 	dput(sb->s_root);
-- 
cgit v1.2.1