From e57cf21e9787c081db4db6afa02e6e70112ee410 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 19 Feb 2015 17:51:39 -0800 Subject: Btrfs: fix allocation size calculations in alloc_btrfs_bio Since commit 8e5cfb55d3f (Btrfs: Make raid_map array be inlined in btrfs_bio structure), the raid map array is allocated along with the btrfs bio in alloc_btrfs_bio. The calculation used to decide how much we need to allocate was using the wrong parameter passed into the allocation function. The passed in real_stripes will be zero if a target replace operation is not currently running. We want to use total_stripes instead. Signed-off-by: Chris Mason Reported-by: David Sterba Tested-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd4d1315aaa9..8222f6f74147 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4903,10 +4903,17 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) { struct btrfs_bio *bbio = kzalloc( + /* the size of the btrfs_bio */ sizeof(struct btrfs_bio) + + /* plus the variable array for the stripes */ sizeof(struct btrfs_bio_stripe) * (total_stripes) + + /* plus the variable array for the tgt dev */ sizeof(int) * (real_stripes) + - sizeof(u64) * (real_stripes), + /* + * plus the raid_map, which includes both the tgt dev + * and the stripes + */ + sizeof(u64) * (total_stripes), GFP_NOFS); if (!bbio) return NULL; -- cgit v1.2.3 From dfcc70a8c868fe03276fa59864149708fb41930b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 23 Feb 2015 22:34:17 +1100 Subject: xfs: Fix quota type in quota structures when reusing quota file For filesystems without separate project quota inode field in the superblock we just reuse project quota file for group quotas (and vice versa) if project quota file is allocated and we need group quota file. When we reuse the file, quota structures on disk suddenly have wrong type stored in d_flags though. Nobody really cares about this (although structure type reported to userspace was wrong as well) except that after commit 14bf61ffe6ac (quota: Switch ->get_dqblk() and ->set_dqblk() to use bytes as space units) assertion in xfs_qm_scall_getquota() started to trigger on xfs/106 test (apparently I was testing without XFS_DEBUG so I didn't notice when submitting the above commit). Fix the problem by properly resetting ddq->d_flags when running quotacheck for a quota file. CC: stable@vger.kernel.org Reported-by: Al Viro Signed-off-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_qm.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 53cc2aaf8d2b..fbbb9e62e274 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -836,6 +836,11 @@ xfs_qm_reset_dqcounts( */ xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, "xfs_quotacheck"); + /* + * Reset type in case we are reusing group quota file for + * project quotas or vice versa + */ + ddq->d_flags = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; -- cgit v1.2.3 From 5885ebda878b47c4b4602d4b0410cb4b282af024 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Feb 2015 22:37:08 +1100 Subject: xfs: ensure truncate forces zeroed blocks to disk A new fsync vs power fail test in xfstests indicated that XFS can have unreliable data consistency when doing extending truncates that require block zeroing. The blocks beyond EOF get zeroed in memory, but we never force those changes to disk before we run the transaction that extends the file size and exposes those blocks to userspace. This can result in the blocks not being correctly zeroed after a crash. Because in-memory behaviour is correct, tools like fsx don't pick up any coherency problems - it's not until the filesystem is shutdown or the system crashes after writing the truncate transaction to the journal but before the zeroed data in the page cache is flushed that the issue is exposed. Fix this by also flushing the dirty data in memory region between the old size and new size when we've found blocks that need zeroing in the truncate process. Reported-by: Liu Bo cc: Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 14 ++++++++++---- fs/xfs/xfs_inode.h | 9 +++++---- fs/xfs/xfs_iops.c | 36 ++++++++++++++---------------------- 3 files changed, 29 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ce615d12fb44..a2e1cb8a568b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -397,7 +397,8 @@ STATIC int /* error (positive) */ xfs_zero_last_block( struct xfs_inode *ip, xfs_fsize_t offset, - xfs_fsize_t isize) + xfs_fsize_t isize, + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); @@ -425,6 +426,7 @@ xfs_zero_last_block( zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; + *did_zeroing = true; return xfs_iozero(ip, isize, zero_len); } @@ -443,7 +445,8 @@ int /* error (positive) */ xfs_zero_eof( struct xfs_inode *ip, xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + xfs_fsize_t isize, /* current inode size */ + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_zero_fsb; @@ -465,7 +468,7 @@ xfs_zero_eof( * We only zero a part of that block so it is handled specially. */ if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize); + error = xfs_zero_last_block(ip, offset, isize, did_zeroing); if (error) return error; } @@ -525,6 +528,7 @@ xfs_zero_eof( if (error) return error; + *did_zeroing = true; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); } @@ -567,13 +571,15 @@ restart: * having to redo all checks before. */ if (*pos > i_size_read(inode)) { + bool zero = false; + if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); goto restart; } - error = xfs_zero_eof(ip, *pos, i_size_read(inode)); + error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); if (error) return error; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 86cd6b39bed7..a1cd55f3f351 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -384,10 +384,11 @@ enum xfs_prealloc_flags { XFS_PREALLOC_INVISIBLE = (1 << 4), }; -int xfs_update_prealloc_flags(struct xfs_inode *, - enum xfs_prealloc_flags); -int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); -int xfs_iozero(struct xfs_inode *, loff_t, size_t); +int xfs_update_prealloc_flags(struct xfs_inode *ip, + enum xfs_prealloc_flags flags); +int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, + xfs_fsize_t isize, bool *did_zeroing); +int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); #define IHOLD(ip) \ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d919ad7b16bf..e53a90331422 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -751,6 +751,7 @@ xfs_setattr_size( int error; uint lock_flags = 0; uint commit_flags = 0; + bool did_zeroing = false; trace_xfs_setattr(ip); @@ -794,20 +795,16 @@ xfs_setattr_size( return error; /* - * Now we can make the changes. Before we join the inode to the - * transaction, take care of the part of the truncation that must be - * done without the inode lock. This needs to be done before joining - * the inode to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. + * File data changes must be complete before we start the transaction to + * modify the inode. This needs to be done before joining the inode to + * the transaction because the inode cannot be unlocked once it is a + * part of the transaction. + * + * Start with zeroing any data block beyond EOF that we may expose on + * file extension. */ if (newsize > oldsize) { - /* - * Do the first part of growing a file: zero any data in the - * last block that is beyond the old EOF. We need to do this - * before the inode is joined to the transaction to modify - * i_size. - */ - error = xfs_zero_eof(ip, newsize, oldsize); + error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); if (error) return error; } @@ -817,23 +814,18 @@ xfs_setattr_size( * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files - * problem. - * - * Only flush from the on disk size to the smaller of the in memory - * file size or the new size as that's the range we really care about - * here and prevents waiting for other data not within the range we - * care about here. + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. */ - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + if (newsize > ip->i_d.di_size && + (oldsize != ip->i_d.di_size || did_zeroing)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* - * Wait for all direct I/O to complete. - */ + /* Now wait for all direct I/O to complete. */ inode_dio_wait(inode); /* -- cgit v1.2.3 From fc921566f4fcf4499e9a6d010391c00be199ab85 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 24 Feb 2015 10:12:55 +1100 Subject: xfs: Ensure we have target_ip for RENAME_EXCHANGE We shouldn't get here with RENAME_EXCHANGE set and no target_ip, but let's be defensive, because xfs_cross_rename() will dereference it. Spotted by Coverity. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index daafa1f6d260..6163767aa856 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2867,6 +2867,10 @@ xfs_rename( * Handle RENAME_EXCHANGE flags */ if (flags & RENAME_EXCHANGE) { + if (target_ip == NULL) { + error = -EINVAL; + goto error_return; + } error = xfs_cross_rename(tp, src_dp, src_name, src_ip, target_dp, target_name, target_ip, &free_list, &first_block, spaceres); -- cgit v1.2.3 From 83d5f01858b56db69c8e4ca5389ef7c29bfdb5dd Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 24 Feb 2015 10:15:18 +1100 Subject: xfs: cancel failed transaction in xfs_fs_commit_blocks() If xfs_trans_reserve fails we don't cancel the transaction, and we'll leak the allocated transaction pointer. Spotted by Coverity. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_pnfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 4b33ef112400..365dd57ea760 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -300,8 +300,10 @@ xfs_fs_commit_blocks( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) + if (error) { + xfs_trans_cancel(tp, 0); goto out_drop_iolock; + } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); -- cgit v1.2.3 From 2a559a8bdeae853b6a8abb477c88875e1d4de591 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 23 Feb 2015 11:34:10 +0000 Subject: eCryptfs: ensure copy to crypt_stat->cipher does not overrun The patch 237fead61998: "[PATCH] ecryptfs: fs/Makefile and fs/Kconfig" from Oct 4, 2006, leads to the following static checker warning: fs/ecryptfs/crypto.c:846 ecryptfs_new_file_context() error: off-by-one overflow 'crypt_stat->cipher' size 32. rl = '0-32' There is a mismatch between the size of ecryptfs_crypt_stat.cipher and ecryptfs_mount_crypt_stat.global_default_cipher_name causing the copy of the cipher name to cause a off-by-one string copy error. This fix ensures the space reserved for this string is the same size including the trailing zero at the end throughout ecryptfs. This fix avoids increasing the size of ecryptfs_crypt_stat.cipher and also ecryptfs_parse_tag_70_packet_silly_stack.cipher_string and instead reduces the of ECRYPTFS_MAX_CIPHER_NAME_SIZE to 31 and includes the + 1 for the end of string terminator. NOTE: An overflow is not possible in practice since the value copied into global_default_cipher_name is validated by ecryptfs_code_for_cipher_string() at mount time. None of the allowed cipher strings are long enough to cause the potential buffer overflow fixed by this patch. Signed-off-by: Colin Ian King Reported-by: Dan Carpenter [tyhicks: Added the NOTE about the overflow not being triggerable] Signed-off-by: Tyler Hicks --- fs/ecryptfs/ecryptfs_kernel.h | 4 ++-- fs/ecryptfs/keystore.c | 2 +- fs/ecryptfs/main.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 90d1882b306f..5ba029e627cc 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -124,7 +124,7 @@ ecryptfs_get_key_payload_data(struct key *key) } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 -#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 #define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ #define ECRYPTFS_SALT_BYTES 2 @@ -237,7 +237,7 @@ struct ecryptfs_crypt_stat { struct crypto_ablkcipher *tfm; struct crypto_hash *hash_tfm; /* Crypto context for generating * the initialization vectors */ - unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; struct list_head keysig_list; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 917bd5c9776a..6bd67e2011f0 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -891,7 +891,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { struct blkcipher_desc desc; char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; char iv[ECRYPTFS_MAX_IV_BYTES]; - char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; }; /** diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1895d60f4122..c095d3264259 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -407,7 +407,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); - BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } -- cgit v1.2.3 From c876486be17aeefe0da569f3d111cbd8de6f675d Mon Sep 17 00:00:00 2001 From: Andrew Elble Date: Wed, 25 Feb 2015 17:46:27 -0500 Subject: nfsd: fix clp->cl_revoked list deletion causing softlock in nfsd commit 2d4a532d385f ("nfsd: ensure that clp->cl_revoked list is protected by clp->cl_lock") removed the use of the reaplist to clean out clp->cl_revoked. It failed to change list_entry() to walk clp->cl_revoked.next instead of reaplist.next Fixes: 2d4a532d385f ("nfsd: ensure that clp->cl_revoked list is protected by clp->cl_lock") Cc: stable@vger.kernel.org Reported-by: Eric Meddaugh Tested-by: Eric Meddaugh Signed-off-by: Andrew Elble Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f6b2a09f793f..d2f2c37dc2db 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1638,7 +1638,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_revoked)) { - dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); nfs4_put_stid(&dp->dl_stid); } -- cgit v1.2.3 From be36e185bd26388355d3ea1847278b96ab350792 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 Feb 2015 17:04:17 -0500 Subject: NFSv4: nfs4_open_recover_helper() must set share access The share access mode is now specified as an argument in the nfs4_opendata, and so nfs4_open_recover_helper() needs to call nfs4_map_atomic_open_share() in order to set it. Fixes: 6ae373394c42 ("NFSv4.1: Ask for no delegation on OPEN if using O_DIRECT") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 88180ac5ea0e..4e41340e957d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1552,6 +1552,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; + opendata->o_arg.share_access = nfs4_map_atomic_open_share( + NFS_SB(opendata->dentry->d_sb), + fmode, 0); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); -- cgit v1.2.3 From 957ed60b53b519064a54988c4e31e0087e47d091 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 27 Feb 2015 15:51:56 -0800 Subject: nilfs2: fix potential memory overrun on inode Each inode of nilfs2 stores a root node of a b-tree, and it turned out to have a memory overrun issue: Each b-tree node of nilfs2 stores a set of key-value pairs and the number of them (in "bn_nchildren" member of nilfs_btree_node struct), as well as a few other "bn_*" members. Since the value of "bn_nchildren" is used for operations on the key-values within the b-tree node, it can cause memory access overrun if a large number is incorrectly set to "bn_nchildren". For instance, nilfs_btree_node_lookup() function determines the range of binary search with it, and too large "bn_nchildren" leads nilfs_btree_node_get_key() in that function to overrun. As for intermediate b-tree nodes, this is prevented by a sanity check performed when each node is read from a drive, however, no sanity check has been done for root nodes stored in inodes. This patch fixes the issue by adding missing sanity check against b-tree root nodes so that it's called when on-memory inodes are read from ifile, inode metadata file. Signed-off-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/btree.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b2e3ff347620..ecdbae19a766 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -31,6 +31,8 @@ #include "alloc.h" #include "dat.h" +static void __nilfs_btree_init(struct nilfs_bmap *bmap); + static struct nilfs_btree_path *nilfs_btree_alloc_path(void) { struct nilfs_btree_path *path; @@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, return ret; } +/** + * nilfs_btree_root_broken - verify consistency of btree root node + * @node: btree root node to be examined + * @ino: inode number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + */ +static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, + unsigned long ino) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level > NILFS_BTREE_LEVEL_MAX || + nchildren < 0 || + nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { + pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", + ino, level, flags, nchildren); + ret = 1; + } + return ret; +} + int nilfs_btree_broken_node_block(struct buffer_head *bh) { int ret; @@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, /* convert and insert */ dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; - nilfs_btree_init(btree); + __nilfs_btree_init(btree); if (nreq != NULL) { nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); @@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_gather_data = NULL, }; -int nilfs_btree_init(struct nilfs_bmap *bmap) +static void __nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); - return 0; +} + +int nilfs_btree_init(struct nilfs_bmap *bmap) +{ + int ret = 0; + + __nilfs_btree_init(bmap); + + if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), + bmap->b_inode->i_ino)) + ret = -EIO; + return ret; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) -- cgit v1.2.3 From aa5accea404b2b92d39c1924cfeb90f6082f6389 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 Feb 2015 22:54:19 -0500 Subject: NFS: Ensure that buffered writes wait for O_DIRECT writes to complete The O_DIRECT code will grab the inode->i_mutex and flush out buffered writes, before scheduling a read or a write. However there is no equivalent in the buffered write code to wait for O_DIRECT to complete. Fixes a reported issue in xfstests generic/133, when first performing an O_DIRECT write followed by a buffered write. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/file.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 94712fc781fa..c045c7169fa0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -372,6 +372,10 @@ start: nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; + /* + * Wait for O_DIRECT to complete + */ + nfs_inode_dio_wait(mapping->host); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) -- cgit v1.2.3 From 140e049c64ce848392adbf4678983ecc76888dde Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 17:42:42 -0500 Subject: NFS: Add a helper to set attribute barriers Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 16 ++++++++++++++++ include/linux/nfs_fs.h | 1 + 2 files changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 83107be3dd01..b0cbc1ba82da 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1260,6 +1260,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_fattr_init); +/** + * nfs_fattr_set_barrier + * @fattr: attributes + * + * Used to set a barrier after an attribute was updated. This + * barrier ensures that older attributes from RPC calls that may + * have raced with our update cannot clobber these new values. + * Note that you are still responsible for ensuring that other + * operations which change the attribute on the server do not + * collide. + */ +void nfs_fattr_set_barrier(struct nfs_fattr *fattr) +{ + fattr->gencount = nfs_inc_attr_generation_counter(); +} + struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 2f77e0c651c8..3a4ffb5856cd 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -369,6 +369,7 @@ extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ct extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); +extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr); extern unsigned long nfs_inc_attr_generation_counter(void); extern struct nfs_fattr *nfs_alloc_fattr(void); -- cgit v1.2.3 From f044636d972246d451e06226cc1675d5da389762 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 16:09:04 -0500 Subject: NFS: Add attribute update barriers to nfs_setattr_update_inode() Ensure that other operations which raced with our setattr RPC call cannot revert the file attribute changes that were made on the server. To do so, we artificially bump the attribute generation counter on the inode so that all calls to nfs_fattr_init() that precede ours will be dropped. The motivation for the patch came from Chuck Lever's reports of readaheads racing with truncate operations and causing the file size to be reverted. Reported-by: Chuck Lever Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 17 ++++++++++++----- fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4proc.c | 6 +++--- fs/nfs/proc.c | 2 +- include/linux/nfs_fs.h | 2 +- 5 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b0cbc1ba82da..3a2d127de499 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -556,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr); * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. + * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { @@ -565,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) if (err) goto out; - spin_lock(&inode->i_lock); i_size_write(inode, offset); /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); + spin_lock(&inode->i_lock); out: return err; } @@ -585,10 +586,15 @@ out: * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ -void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, + struct nfs_fattr *fattr) { + /* Barrier: bump the attribute generation count. */ + nfs_fattr_set_barrier(fattr); + + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { - spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -600,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_gid = attr->ia_gid; nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); - spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } + nfs_update_inode(inode, fattr); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 78e557c3ab87..11109a137c0c 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e41340e957d..c499e02a58ca 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2416,8 +2416,8 @@ static int _nfs4_do_open(struct inode *dir, opendata->o_res.f_attr, sattr, state, label, olabel); if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } @@ -3291,7 +3291,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); if (status == 0) { - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); } nfs4_label_free(label); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b09cc23d6f43..6202bc0f11bb 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 3a4ffb5856cd..f26e64e0aff8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -356,7 +356,7 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); -extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); +extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); -- cgit v1.2.3 From f5062003465c20cfe584d9129a463322ad5cf4ea Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 19:34:32 -0500 Subject: NFS: Set an attribute barrier on all updates Ensure that we update the attribute barrier even if there were no invalidations, provided that this value is newer than the old one. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3a2d127de499..299bf7171a4d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1738,6 +1738,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; + /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { @@ -1745,6 +1746,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } + /* Set the barrier to be more recent than this fattr */ + if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + nfsi->attr_gencount = fattr->gencount; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ -- cgit v1.2.3 From a08a8cd375db9769588257e7782f6b6b68561b88 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 17:36:09 -0500 Subject: NFS: Add attribute update barriers to NFS writebacks Ensure that other operations that race with our write RPC calls cannot revert the file size updates that were made on the server. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 25 ++++++++++++++++++++++--- fs/nfs/internal.h | 1 + fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4proc.c | 2 +- fs/nfs/proc.c | 4 +--- fs/nfs/write.c | 30 ++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 7 files changed, 57 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 299bf7171a4d..ff9a6795da46 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1491,7 +1491,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** - * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode - pointer to inode * @fattr - updated attributes * @@ -1501,11 +1501,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); * * This function is mainly designed to be used by the ->write_done() functions. */ -int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int status; - spin_lock(&inode->i_lock); /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { @@ -1537,6 +1536,26 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b802fb3a2d99..9e6475bc5ba2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -459,6 +459,7 @@ void nfs_mark_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 11109a137c0c..1f11d2533ee4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -834,7 +834,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c499e02a58ca..b022e64b76a5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4237,7 +4237,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); - nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr); + nfs_writeback_update_inode(hdr); } return 0; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 6202bc0f11bb..c63189acd052 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = hdr->inode; - if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 595d81e354d1..849ed784d6ac 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1377,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode) return 0; } +static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, + struct nfs_fattr *fattr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return; + if (argp->offset + resp->count != fattr->size) + return; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + return; + /* Set attribute barrier */ + nfs_fattr_set_barrier(fattr); +} + +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) +{ + struct nfs_fattr *fattr = hdr->res.fattr; + struct inode *inode = hdr->inode; + + if (fattr == NULL) + return; + spin_lock(&inode->i_lock); + nfs_writeback_check_extend(hdr, fattr); + nfs_post_op_update_inode_force_wcc_locked(inode, fattr); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_writeback_update_inode); + /* * This function is called when the WRITE call is complete. */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f26e64e0aff8..59b1516b9fd4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -343,6 +343,7 @@ extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); +extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); -- cgit v1.2.3 From 8f8ba1d739b7047e2e1d91735716af2799ff2b1e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 17:54:58 -0500 Subject: NFSv4: Add attribute update barriers to delegreturn and pNFS layoutcommit Ensure that other operations that race with delegreturn and layoutcommit cannot revert the attribute updates that were made on the server. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ff9a6795da46..cd094d652199 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1555,6 +1555,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa int status; spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; -- cgit v1.2.3 From 00fb4c9f8421c9aac3947d36ffe8e049b95f7ab1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 17:57:14 -0500 Subject: NFS: Remove size hack in nfs_inode_attrs_need_update() Prior to this patch, we used to always OK attribute updates that extended the file size on the assumption that we might be performing writeback. Now that we have attribute barriers to protect the writeback related updates, we should remove this hack, as it can cause truncate() operations to apparently be reverted if/when a readahead or getattr RPC call races with our on-the-wire SETATTR. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index cd094d652199..fef65d1e024e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1238,13 +1238,6 @@ static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fat return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } -static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) - return 0; - return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); -} - static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) @@ -1393,7 +1386,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || nfs_ctime_need_update(inode, fattr) || - nfs_size_need_update(inode, fattr) || ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } -- cgit v1.2.3 From 92d64e47b67b5e7fe1b5358402ab222a32ec3479 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 19:48:26 -0500 Subject: NFS: Fix nfs_post_op_update_inode() to set an attribute barrier nfs_post_op_update_inode() is called after a self-induced attribute update. Ensure that it also sets the barrier. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index fef65d1e024e..c66c1df467f4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1475,6 +1475,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) int status; spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); -- cgit v1.2.3 From 3235b40303b6f609c446275d0e7f6f9f4fe94156 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 19:52:06 -0500 Subject: NFSv4: Set a barrier in the update_changeattr() helper Ensure that we don't regress the changes that were made to the directory. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 1 + fs/nfs/nfs4proc.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c66c1df467f4..5026c44a98e1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1249,6 +1249,7 @@ unsigned long nfs_inc_attr_generation_counter(void) { return atomic_long_inc_return(&nfs_attr_generation_counter); } +EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); void nfs_fattr_init(struct nfs_fattr *fattr) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b022e64b76a5..a211daf58c32 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -901,6 +901,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); dir->i_version = cinfo->after; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); } -- cgit v1.2.3 From 6c441c254eea2354d686be7f5544bcd79fb6a61f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 22 Feb 2015 16:35:36 -0500 Subject: NFS: Don't invalidate a submounted dentry in nfs_prime_dcache() If we're traversing a directory which contains a submounted filesystem, or one that has a referral, the NFS server that is processing the READDIR request will often return information for the underlying (mounted-on) directory. It may, or may not, also return filehandle information. If this happens, and the lookup in nfs_prime_dcache() returns the dentry for the submounted directory, the filehandle comparison will fail, and we call d_invalidate(). Post-commit 8ed936b5671bf ("vfs: Lazily remove mounts on unlinked files and directories."), this means the entire subtree is unmounted. The following minimal patch addresses this problem by punting on the invalidation if there is a submount. Kudos to Neil Brown for having tracked down this issue (see link). Reported-by: Nix Link: http://lkml.kernel.org/r/87iofju9ht.fsf@spindle.srvr.nix Cc: stable@vger.kernel.org # 3.18+ Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9b0c55cb2a2e..4ad7fff9ccaf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -469,6 +469,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct inode *inode; int status; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -479,6 +481,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) dentry = d_lookup(parent, &filename); if (dentry != NULL) { + /* Is there a mountpoint here? If so, just exit */ + if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, + &entry->fattr->fsid)) + goto out; if (nfs_same_file(dentry, entry)) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(dentry->d_inode, entry->fattr); -- cgit v1.2.3 From 1ae04b252351188cfa66af1b8b0628512a72dd7b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 23 Feb 2015 16:15:00 -0500 Subject: NFSv3: Use the readdir fileid as the mounted-on-fileid When we call readdirplus, set the fileid normally returned by readdir as the mounted-on-fileid, since that is commonly the case if there is a mountpoint. To ensure that we get it right, we only set the flag if the readdir fileid differs from the one returned in the readdirplus attributes. This again means that we can avoid the issues described in commit 2ef47eb1aee17 ("NFS: Fix use of nfs_attr_use_mounted_on_fileid()"), which only fixed NFSv4. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3xdr.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 2a932fdc57cb..53852a4bd88b 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1987,6 +1987,11 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (entry->fattr->fileid != entry->ino) { + entry->fattr->mounted_on_fileid = entry->ino; + entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) -- cgit v1.2.3 From fa9233699cc1dc236f4cf42245d13e40966938c5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 23 Feb 2015 18:51:32 -0500 Subject: NFS: Don't require a filehandle to refresh the inode in nfs_prime_dcache() If the server does not return a valid set of attributes that we can use to either create a file or refresh the inode, then there is no value in calling nfs_prime_dcache(). However if we're just refreshing the inode using the attributes that the server returned, then it shouldn't matter whether or not we have a filehandle, as long as we check the fsid+fileid combination. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4ad7fff9ccaf..c19e16f0b2d0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -408,14 +408,22 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc, return 0; } +/* Match file and dirent using either filehandle or fileid + * Note: caller is responsible for checking the fsid + */ static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { + struct nfs_inode *nfsi; + if (dentry->d_inode == NULL) goto different; - if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) - goto different; - return 1; + + nfsi = NFS_I(dentry->d_inode); + if (entry->fattr->fileid == nfsi->fileid) + return 1; + if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) + return 1; different: return 0; } @@ -469,6 +477,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct inode *inode; int status; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) + return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; if (filename.name[0] == '.') { -- cgit v1.2.3 From 7c0af9ffb7bb4e5355470fa60b3eb711ddf226fa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 12:54:46 -0500 Subject: NFSv4: Don't call put_rpccred() under the rcu_read_lock() put_rpccred() can sleep. Fixes: 8f649c3762547 ("NFSv4: Fix the locking in nfs_inode_reclaim_delegation()") Cc: stable@vger.kernel.org # 2.6.35+ Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a1f0685b42ff..2e37d8315d92 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -181,8 +181,8 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); spin_unlock(&delegation->lock); - put_rpccred(oldcred); rcu_read_unlock(); + put_rpccred(oldcred); trace_nfs4_reclaim_delegation(inode, res->delegation_type); } else { /* We appear to have raced with a delegation return. */ -- cgit v1.2.3 From 4d884fceaa2c838abb598778813e93f6d9fea723 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Feb 2015 17:17:43 +0000 Subject: Btrfs: fix fsync race leading to ordered extent memory leaks We can have multiple fsync operations against the same file during the same transaction and they can collect the same ordered extents while they don't complete (still accessible from the inode's ordered tree). If this happens, those ordered extents will never get their reference counts decremented to 0, leading to memory leaks and inode leaks (an iput for an ordered extent's inode is scheduled only when the ordered extent's refcount drops to 0). The following sequence diagram explains this race: CPU 1 CPU 2 btrfs_sync_file() btrfs_sync_file() mutex_lock(inode->i_mutex) btrfs_log_inode() btrfs_get_logged_extents() --> collects ordered extent X --> increments ordered extent X's refcount btrfs_submit_logged_extents() mutex_unlock(inode->i_mutex) mutex_lock(inode->i_mutex) btrfs_sync_log() btrfs_wait_logged_extents() --> list_del_init(&ordered->log_list) btrfs_log_inode() btrfs_get_logged_extents() --> Adds ordered extent X to logged_list because at this point: list_empty(&ordered->log_list) && test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags) == 0 --> Increments ordered extent X's refcount --> check if ordered extent's io is finished or not, start it if necessary and wait for it to finish --> sets bit BTRFS_ORDERED_LOGGED on ordered extent X's flags and adds it to trans->ordered btrfs_sync_log() finishes btrfs_submit_logged_extents() btrfs_log_inode() finishes mutex_unlock(inode->i_mutex) btrfs_sync_file() finishes btrfs_sync_log() btrfs_wait_logged_extents() --> Sees ordered extent X has the bit BTRFS_ORDERED_LOGGED set in its flags --> X's refcount is untouched btrfs_sync_log() finishes btrfs_sync_file() finishes btrfs_commit_transaction() --> called by transaction kthread for e.g. btrfs_wait_pending_ordered() --> waits for ordered extent X to complete --> decrements ordered extent X's refcount by 1 only, corresponding to the increment done by the fsync task ran by CPU 1 In the scenario of the above diagram, after the transaction commit, the ordered extent will remain with a refcount of 1 forever, leaking the ordered extent structure and preventing the i_count of its inode from ever decreasing to 0, since the delayed iput is scheduled only when the ordered extent's refcount drops to 0, preventing the inode from ever being evicted by the VFS. Fix this by using the flag BTRFS_ORDERED_LOGGED differently. Use it to mean that an ordered extent is already being processed by an fsync call, which will attach it to the current transaction, preventing it from being collected by subsequent fsync operations against the same inode. This race was introduced with the following change (added in 3.19 and backported to stable 3.18 and 3.17): Btrfs: make sure logged extents complete in the current transaction V3 commit 50d9aa99bd35c77200e0e3dd7a72274f8304701f I ran into this issue while running xfstests/generic/113 in a loop, which failed about 1 out of 10 runs with the following warning in dmesg: [ 2612.440038] WARNING: CPU: 4 PID: 22057 at fs/btrfs/disk-io.c:3558 free_fs_root+0x36/0x133 [btrfs]() [ 2612.442810] Modules linked in: btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop processor parport_pc parport psmouse therma l_sys i2c_piix4 serio_raw pcspkr evdev microcode button i2c_core ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom virtio_scsi ata_generic virtio_pci ata_piix virtio_ring libata virtio flo ppy e1000 scsi_mod [last unloaded: btrfs] [ 2612.452711] CPU: 4 PID: 22057 Comm: umount Tainted: G W 3.19.0-rc5-btrfs-next-4+ #1 [ 2612.454921] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 2612.457709] 0000000000000009 ffff8801342c3c78 ffffffff8142425e ffff88023ec8f2d8 [ 2612.459829] 0000000000000000 ffff8801342c3cb8 ffffffff81045308 ffff880046460000 [ 2612.461564] ffffffffa036da56 ffff88003d07b000 ffff880046460000 ffff880046460068 [ 2612.463163] Call Trace: [ 2612.463719] [] dump_stack+0x4c/0x65 [ 2612.464789] [] warn_slowpath_common+0xa1/0xbb [ 2612.466026] [] ? free_fs_root+0x36/0x133 [btrfs] [ 2612.467247] [] warn_slowpath_null+0x1a/0x1c [ 2612.468416] [] free_fs_root+0x36/0x133 [btrfs] [ 2612.469625] [] btrfs_drop_and_free_fs_root+0x93/0x9b [btrfs] [ 2612.471251] [] btrfs_free_fs_roots+0xa4/0xd6 [btrfs] [ 2612.472536] [] ? wait_for_completion+0x24/0x26 [ 2612.473742] [] close_ctree+0x1f3/0x33c [btrfs] [ 2612.475477] [] ? destroy_workqueue+0x148/0x1ba [ 2612.476695] [] btrfs_put_super+0x19/0x1b [btrfs] [ 2612.477911] [] generic_shutdown_super+0x73/0xef [ 2612.479106] [] kill_anon_super+0x13/0x1e [ 2612.480226] [] btrfs_kill_super+0x17/0x23 [btrfs] [ 2612.481471] [] deactivate_locked_super+0x3b/0x50 [ 2612.482686] [] deactivate_super+0x3f/0x43 [ 2612.483791] [] cleanup_mnt+0x59/0x78 [ 2612.484842] [] __cleanup_mnt+0x12/0x14 [ 2612.485900] [] task_work_run+0x8f/0xbc [ 2612.486960] [] do_notify_resume+0x5a/0x6b [ 2612.488083] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 2612.489333] [] int_signal+0x12/0x17 [ 2612.490353] ---[ end trace 54a960a6bdcb8d93 ]--- [ 2612.557253] VFS: Busy inodes after unmount of sdb. Self-destruct in 5 seconds. Have a nice day... Kmemleak confirmed the ordered extent leak (and btrfs inode specific structures such as delayed nodes): $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff880154290db0 (size 576): comm "btrfsck", pid 21980, jiffies 4295542503 (age 1273.412s) hex dump (first 32 bytes): 01 40 00 00 01 00 00 00 b0 1d f1 4e 01 88 ff ff .@.........N.... 00 00 00 00 00 00 00 00 c8 0d 29 54 01 88 ff ff ..........)T.... backtrace: [] kmemleak_update_trace+0x4c/0x6a [] radix_tree_node_alloc+0x6d/0x83 [] __radix_tree_create+0x109/0x190 [] radix_tree_insert+0x30/0xac [] btrfs_get_or_create_delayed_node+0x130/0x187 [btrfs] [] btrfs_delayed_delete_inode_ref+0x32/0xac [btrfs] [] __btrfs_unlink_inode+0xee/0x288 [btrfs] [] btrfs_unlink_inode+0x1e/0x40 [btrfs] [] btrfs_unlink+0x60/0x9b [btrfs] [] vfs_unlink+0x9c/0xed [] do_unlinkat+0x12c/0x1fa [] SyS_unlinkat+0x29/0x2b [] system_call_fastpath+0x12/0x17 [] 0xffffffffffffffff unreferenced object 0xffff88014ef11db0 (size 576): comm "rm", pid 22009, jiffies 4295542593 (age 1273.052s) hex dump (first 32 bytes): 02 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 c8 1d f1 4e 01 88 ff ff ...........N.... backtrace: [] kmemleak_update_trace+0x4c/0x6a [] radix_tree_node_alloc+0x6d/0x83 [] __radix_tree_create+0x109/0x190 [] radix_tree_insert+0x30/0xac [] btrfs_get_or_create_delayed_node+0x130/0x187 [btrfs] [] btrfs_delayed_delete_inode_ref+0x32/0xac [btrfs] [] __btrfs_unlink_inode+0xee/0x288 [btrfs] [] btrfs_unlink_inode+0x1e/0x40 [btrfs] [] btrfs_unlink+0x60/0x9b [btrfs] [] vfs_unlink+0x9c/0xed [] do_unlinkat+0x12c/0x1fa [] SyS_unlinkat+0x29/0x2b [] system_call_fastpath+0x12/0x17 [] 0xffffffffffffffff unreferenced object 0xffff8800336feda8 (size 584): comm "aio-stress", pid 22031, jiffies 4295543006 (age 1271.400s) hex dump (first 32 bytes): 00 40 3e 00 00 00 00 00 00 00 8f 42 00 00 00 00 .@>........B.... 00 00 01 00 00 00 00 00 00 00 01 00 00 00 00 00 ................ backtrace: [] create_object+0x172/0x29a [] kmemleak_alloc+0x25/0x41 [] kmemleak_alloc_recursive.constprop.52+0x16/0x18 [] kmem_cache_alloc+0xf7/0x198 [] __btrfs_add_ordered_extent+0x43/0x309 [btrfs] [] btrfs_add_ordered_extent_dio+0x12/0x14 [btrfs] [] btrfs_get_blocks_direct+0x3ef/0x571 [btrfs] [] do_blockdev_direct_IO+0x62a/0xb47 [] __blockdev_direct_IO+0x34/0x36 [] btrfs_direct_IO+0x16a/0x1e8 [btrfs] [] generic_file_direct_write+0xb8/0x12d [] btrfs_file_write_iter+0x24b/0x42f [btrfs] [] aio_run_iocb+0x2b7/0x32e [] do_io_submit+0x26e/0x2ff [] SyS_io_submit+0x10/0x12 [] system_call_fastpath+0x12/0x17 CC: # 3.19, 3.18 and 3.17 Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 534544e08f76..157cc54fc634 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -452,9 +452,7 @@ void btrfs_get_logged_extents(struct inode *inode, continue; if (entry_end(ordered) <= start) break; - if (!list_empty(&ordered->log_list)) - continue; - if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); atomic_inc(&ordered->refs); @@ -511,8 +509,7 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) - list_add_tail(&ordered->trans_list, &trans->ordered); + list_add_tail(&ordered->trans_list, &trans->ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); -- cgit v1.2.3 From 0c0ef4bc842ba6b593bb94f9fb8b653fe18c5ed8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Feb 2015 09:43:51 -0500 Subject: Btrfs: abort the transaction if we fail to update the free space cache inode Our gluster boxes were hitting a problem where they'd run out of space when updating the block group cache and therefore wouldn't be able to update the free space inode. This is a problem because this is how we invalidate the cache and protect ourselves from errors further down the stack, so if this fails we have to abort the transaction so we make sure we don't end up with stale free space cache. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 28ce5c8004d4..92146a5afdc1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3208,6 +3208,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, return 0; } + if (trans->aborted) + return 0; again: inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -3243,6 +3245,20 @@ again: */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } WARN_ON(ret); if (i_size_read(inode) > 0) { -- cgit v1.2.3 From e8c1c76e804b18120e6977fc092769c043876212 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 15 Feb 2015 22:38:54 +0000 Subject: Btrfs: add missing inode update when punching hole When punching a file hole if we endup only zeroing parts of a page, because the start offset isn't a multiple of the sector size or the start offset and length fall within the same page, we were not updating the inode item. This prevented an fsync from doing anything, if no other file changes happened in the current transaction, because the fields in btrfs_inode used to check if the inode needs to be fsync'ed weren't updated. This issue is easy to reproduce and the following excerpt from the xfstest case I made shows how to trigger it: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file. $XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Fsync the file, this makes btrfs update some btrfs inode specific fields # that are used to track if the inode needs to be written/updated to the fsync # log or not. After this fsync, the new values for those fields indicate that # a subsequent fsync does not need to touch the fsync log. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Force a commit of the current transaction. After this point, any operation # that modifies the data or metadata of our file, should update those fields in # the btrfs inode with values that make the next fsync operation write to the # fsync log. sync # Punch a hole in our file. This small range affects only 1 page. # This made the btrfs hole punching implementation write only some zeroes in # one page, but it did not update the btrfs inode fields used to determine if # the next fsync needs to write to the fsync log. $XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo # Another variation of the previously mentioned case. $XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo # Now fsync the file. This was a no-operation because the previous hole punch # operation didn't update the inode's fields mentioned before, so they remained # with the values they had after the first fsync - that is, they indicate that # it is not needed to write to fsync log. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Enable writes and mount the fs. This makes the fsync log replay code run. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Because the last fsync didn't do anything, here the file content matched what # it was after the first fsync, before the holes were punched, and not what it # was after the holes were punched. echo "File content after:" od -t x1 $SCRATCH_MNT/foo This issue has been around since 2012, when the punch hole implementation was added, commit 2aaa66558172 ("Btrfs: add hole punching"). A test case for xfstests follows soon. Signed-off-by: Filipe Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..b476e5645034 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2276,6 +2276,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; + bool truncated_page = false; + bool updated_inode = false; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2307,13 +2309,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < ino_size) + if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, len, 0); + } else { + ret = 0; + } goto out_only_mutex; } /* zero back part of the first page */ if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2349,6 +2356,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, tail_start + tail_len, 0, 1); if (ret) @@ -2358,8 +2366,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (lockend < lockstart) { - mutex_unlock(&inode->i_mutex); - return 0; + ret = 0; + goto out_only_mutex; } while (1) { @@ -2507,6 +2515,7 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); + updated_inode = true; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); out_free: @@ -2516,6 +2525,22 @@ out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); out_only_mutex: + if (!updated_inode && truncated_page && !ret && !err) { + /* + * If we only end up zeroing part of a page, we still need to + * update the inode item, so that all the time fields are + * updated as well as the necessary btrfs inode in memory fields + * for detecting, at fsync time, if the inode isn't yet in the + * log tree or it's there but not up to date. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + } else { + err = btrfs_update_inode(trans, root, inode); + ret = btrfs_end_transaction(trans, root); + } + } mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; -- cgit v1.2.3 From 5dfe2be7ead15863fd7b3fcc8bd69e470fae2bec Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 23 Feb 2015 19:48:52 +0000 Subject: Btrfs: fix off-by-one logic error in btrfs_realloc_node The end_slot variable actually matches the number of pointers in the node and not the last slot (which is 'nritems - 1'). Therefore in order to check that the current slot in the for loop doesn't match the last one, the correct logic is to check if 'i' is less than 'end_slot - 1' and not 'end_slot - 2'. Fix this and set end_slot to be 'nritems - 1', as it's less confusing since the variable name implies it's inclusive rather then exclusive. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 993642199326..6d67f32e648d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1645,14 +1645,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, parent_nritems = btrfs_header_nritems(parent); blocksize = root->nodesize; - end_slot = parent_nritems; + end_slot = parent_nritems - 1; - if (parent_nritems == 1) + if (parent_nritems <= 1) return 0; btrfs_set_lock_blocking(parent); - for (i = start_slot; i < end_slot; i++) { + for (i = start_slot; i <= end_slot; i++) { int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1669,7 +1669,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, other = btrfs_node_blockptr(parent, i - 1); close = close_blocks(blocknr, other, blocksize); } - if (!close && i < end_slot - 2) { + if (!close && i < end_slot) { other = btrfs_node_blockptr(parent, i + 1); close = close_blocks(blocknr, other, blocksize); } -- cgit v1.2.3 From 5cdf83edb8e41cad1ec8eab2d402b4f9d9eb7ee0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 23 Feb 2015 19:50:49 +0000 Subject: Btrfs: do not ignore errors from btrfs_lookup_xattr in do_setxattr The return value from btrfs_lookup_xattr() can be a pointer encoding an error, therefore deal with it. This fixes commit 5f5bc6b1e2d5 ("Btrfs: make xattr replace operations atomic"). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/xattr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 47b19465f0dc..883b93623bc5 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -111,6 +111,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans, name, name_len, -1); if (!di && (flags & XATTR_REPLACE)) ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); else if (di) ret = btrfs_delete_one_dir_name(trans, root, path, di); goto out; @@ -127,10 +129,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans, ASSERT(mutex_is_locked(&inode->i_mutex)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); - if (!di) { + if (!di) ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); + if (ret) goto out; - } btrfs_release_path(path); di = NULL; } -- cgit v1.2.3 From 1932b7be973b554ffe20a5bba6ffaed6fa995cdc Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 24 Feb 2015 18:57:18 +0100 Subject: btrfs: fix lost return value due to variable shadowing A block-local variable stores error code but btrfs_get_blocks_direct may not return it in the end as there's a ret defined in the function scope. CC: # 3.6+ Fixes: d187663ef24c ("Btrfs: lock extents as we map them in DIO") Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8564d8ce03de..91a87f53be3c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7288,7 +7288,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { int type; - int ret; u64 block_start, orig_start, orig_block_len, ram_bytes; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) -- cgit v1.2.3 From 84471e2429ed82fdbac0c56d5b2a18d450f99f6a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 28 Feb 2015 22:29:22 +0000 Subject: Btrfs: incremental send, don't rename a directory too soon There's one more case where we can't issue a rename operation for a directory as soon as we process it. We used to delay directory renames only if they have some ancestor directory with a higher inode number that got renamed too, but there's another case where we need to delay the rename too - when a directory A is renamed to the old name of a directory B but that directory B has its rename delayed because it has now (in the send root) an ancestor with a higher inode number that was renamed. If we don't delay the directory rename in this case, the receiving end of the send stream will attempt to rename A to the old name of B before B got renamed to its new name, which results in a "directory not empty" error. So fix this by delaying directory renames for this case too. Steps to reproduce: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/a $ mkdir /mnt/b $ mkdir /mnt/c $ touch /mnt/a/file $ btrfs subvolume snapshot -r /mnt /mnt/snap1 $ mv /mnt/c /mnt/x $ mv /mnt/a /mnt/x/y $ mv /mnt/b /mnt/a $ btrfs subvolume snapshot -r /mnt /mnt/snap2 $ btrfs send /mnt/snap1 -f /tmp/1.send $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.send $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt2 $ btrfs receive /mnt2 -f /tmp/1.send $ btrfs receive /mnt2 -f /tmp/2.send ERROR: rename b -> a failed. Directory not empty A test case for xfstests follows soon. Reported-by: Ames Cornish Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/send.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 156 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fe5857223515..d6033f540cc7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -230,6 +230,7 @@ struct pending_dir_move { u64 parent_ino; u64 ino; u64 gen; + bool is_orphan; struct list_head update_refs; }; @@ -2984,7 +2985,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 ino_gen, u64 parent_ino, struct list_head *new_refs, - struct list_head *deleted_refs) + struct list_head *deleted_refs, + const bool is_orphan) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2999,6 +3001,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; + pm->is_orphan = is_orphan; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3131,16 +3134,20 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) rmdir_ino = dm->rmdir_ino; free_waiting_dir_move(sctx, dm); - ret = get_first_ref(sctx->parent_root, pm->ino, - &parent_ino, &parent_gen, name); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, parent_ino, parent_gen, - from_path); - if (ret < 0) - goto out; - ret = fs_path_add_path(from_path, name); + if (pm->is_orphan) { + ret = gen_unique_name(sctx, pm->ino, + pm->gen, from_path); + } else { + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + } if (ret < 0) goto out; @@ -3150,7 +3157,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) LIST_HEAD(deleted_refs); ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs); + &pm->update_refs, &deleted_refs, + pm->is_orphan); if (ret < 0) goto out; if (rmdir_ino) { @@ -3283,6 +3291,127 @@ out: return ret; } +/* + * We might need to delay a directory rename even when no ancestor directory + * (in the send root) with a higher inode number than ours (sctx->cur_ino) was + * renamed. This happens when we rename a directory to the old name (the name + * in the parent root) of some other unrelated directory that got its rename + * delayed due to some ancestor with higher number that got renamed. + * + * Example: + * + * Parent snapshot: + * . (ino 256) + * |---- a/ (ino 257) + * | |---- file (ino 260) + * | + * |---- b/ (ino 258) + * |---- c/ (ino 259) + * + * Send snapshot: + * . (ino 256) + * |---- a/ (ino 258) + * |---- x/ (ino 259) + * |---- y/ (ino 257) + * |----- file (ino 260) + * + * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 + * from 'a' to 'x/y' happening first, which in turn depends on the rename of + * inode 259 from 'c' to 'x'. So the order of rename commands the send stream + * must issue is: + * + * 1 - rename 259 from 'c' to 'x' + * 2 - rename 257 from 'a' to 'x/y' + * 3 - rename 258 from 'b' to 'a' + * + * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can + * be done right away and < 0 on error. + */ +static int wait_for_dest_dir_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + u64 left_gen; + u64 right_gen; + int ret = 0; + + if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = parent_ref->dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); + + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + + di = btrfs_match_dir_item_name(sctx->parent_root, path, + parent_ref->name, parent_ref->name_len); + if (!di) { + ret = 0; + goto out; + } + /* + * di_key.objectid has the number of the inode that has a dentry in the + * parent directory with the same name that sctx->cur_ino is being + * renamed to. We need to check if that inode is in the send root as + * well and if it is currently marked as an inode with a pending rename, + * if it is, we need to delay the rename of sctx->cur_ino as well, so + * that it happens after that other inode is renamed. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); + if (di_key.type != BTRFS_INODE_ITEM_KEY) { + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, + &left_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, + &right_gen, NULL, NULL, NULL, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + /* Different inode, no need to delay the rename of sctx->cur_ino */ + if (right_gen != left_gen) { + ret = 0; + goto out; + } + + if (is_waiting_for_move(sctx, di_key.objectid)) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + di_key.objectid, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } +out: + btrfs_free_path(path); + return ret; +} + static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref) { @@ -3349,7 +3478,8 @@ out: sctx->cur_inode_gen, ino, &sctx->new_refs, - &sctx->deleted_refs); + &sctx->deleted_refs, + false); if (!ret) ret = 1; } @@ -3372,6 +3502,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int did_overwrite = 0; int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool can_rename = true; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -3490,12 +3621,22 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { + ret = wait_for_dest_dir_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. */ - if (is_orphan) { + if (is_orphan && can_rename) { ret = send_rename(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -3503,7 +3644,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = fs_path_copy(valid_path, cur->full_path); if (ret < 0) goto out; - } else { + } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* * Dirs can't be linked, so move it. For moved -- cgit v1.2.3 From 369d6b7f00977eb9090212d4a47ac71f3ec5c217 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 2 Mar 2015 16:46:09 -0500 Subject: NFS: Fix stateid used for NFS v4 closes After 566fcec60 the client uses the "current stateid" from the nfs4_state structure to close a file. This could potentially contain a delegation stateid, which is disallowed by the protocol and causes servers to return NFS4ERR_BAD_STATEID. This patch restores the (correct) behavior of sending the open stateid to close a file. Reported-by: Olga Kornievskaia Fixes: 566fcec60 (NFSv4: Fix an atomicity problem in CLOSE) Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a211daf58c32..732526e04cd5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2655,7 +2655,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, - &state->stateid)) { + &state->open_stateid)) { rpc_restart_call_prepare(task); goto out_release; } @@ -2691,7 +2691,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid); + nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { -- cgit v1.2.3 From b04b22f4ca691280f0ab3f77954f5a21500881e7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 13:59:38 -0500 Subject: NFSv4: Ensure that we don't reap a delegation that is being returned Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 2e37d8315d92..d9caf73eef48 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -815,12 +815,14 @@ restart: inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - delegation, server); + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - - if (delegation != NULL) - nfs_free_delegation(delegation); + if (delegation != NULL) { + delegation = nfs_detach_delegation(NFS_I(inode), + delegation, server); + if (delegation != NULL) + nfs_free_delegation(delegation); + } iput(inode); goto restart; } -- cgit v1.2.3 From ade04647dd56881e285983af3db702d56ee97e86 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 Feb 2015 14:25:50 -0500 Subject: NFSv4: Ensure we honour NFS_DELEGATION_RETURNING in nfs_inode_set_delegation() Ensure that nfs_inode_set_delegation() doesn't inadvertently detach a delegation that is already in the process of being returned. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d9caf73eef48..5ca502b5f877 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -370,7 +370,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, + if (test_and_set_bit(NFS_DELEGATION_RETURNING, + &old_delegation->flags)) + goto out; + freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp); if (freeme == NULL) goto out; -- cgit v1.2.3 From 9f0f8e12c48e4bb89192a0de876c77dc1fbfaa75 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 09:57:34 -0500 Subject: NFSv4: Pin the superblock while we're returning the delegation This patch ensures that the superblock doesn't go ahead and disappear underneath us while the state manager thread is returning delegations. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5ca502b5f877..be313e791e67 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -474,14 +474,20 @@ restart: super_list) { if (!nfs_delegation_need_return(delegation)) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); + nfs_sb_deactive(server->super); if (!err) goto restart; set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); @@ -815,9 +821,14 @@ restart: if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { @@ -827,6 +838,7 @@ restart: nfs_free_delegation(delegation); } iput(inode); + nfs_sb_deactive(server->super); goto restart; } } -- cgit v1.2.3 From ec3ca4e57e00d52ff724b0ae49f4489667a9c311 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 14:05:05 -0500 Subject: NFSv4: Ensure we skip delegations that are already being returned In nfs_client_return_marked_delegations() and nfs_delegation_reap_unclaimed() we want to optimise the loop traversal by skipping delegations that are already in the process of being returned. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index be313e791e67..a6ad68865880 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -436,6 +436,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + goto out; if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { @@ -447,6 +449,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) ret = true; spin_unlock(&delegation->lock); } +out: return ret; } @@ -818,6 +821,9 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; -- cgit v1.2.3 From 6d65261a09adaa374c05de807f73a144d783669e Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 24 Feb 2015 19:28:10 -0600 Subject: eCryptfs: don't pass fs-specific ioctl commands through eCryptfs can't be aware of what to expect when after passing an arbitrary ioctl command through to the lower filesystem. The ioctl command may trigger an action in the lower filesystem that is incompatible with eCryptfs. One specific example is when one attempts to use the Btrfs clone ioctl command when the source file is in the Btrfs filesystem that eCryptfs is mounted on top of and the destination fd is from a new file created in the eCryptfs mount. The ioctl syscall incorrectly returns success because the command is passed down to Btrfs which thinks that it was able to do the clone operation. However, the result is an empty eCryptfs file. This patch allows the trim, {g,s}etflags, and {g,s}etversion ioctl commands through and then copies up the inode metadata from the lower inode to the eCryptfs inode to catch any changes made to the lower inode's metadata. Those five ioctl commands are mostly common across all filesystems but the whitelist may need to be further pruned in the future. https://bugzilla.kernel.org/show_bug.cgi?id=93691 https://launchpad.net/bugs/1305335 Signed-off-by: Tyler Hicks Cc: Rocko Cc: Colin Ian King Cc: stable@vger.kernel.org # v2.6.36+: c43f7b8 eCryptfs: Handle ioctl calls with unlocked and compat functions --- fs/ecryptfs/file.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index b07731e68c0b..fd39bad6f1bd 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -303,9 +303,22 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOTTY; - if (lower_file->f_op->unlocked_ioctl) + if (!lower_file->f_op->unlocked_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + case FS_IOC_GETVERSION: + case FS_IOC_SETVERSION: rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #ifdef CONFIG_COMPAT @@ -315,9 +328,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (lower_file->f_op->compat_ioctl) + if (!lower_file->f_op->compat_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC32_GETFLAGS: + case FS_IOC32_SETFLAGS: + case FS_IOC32_GETVERSION: + case FS_IOC32_SETVERSION: rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #endif -- cgit v1.2.3 From 874f946376de57c8d6230b30ad71f742883fee3a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Mar 2015 23:32:08 -0500 Subject: NFS: Fix a regression in the read() syscall When invalidating the page cache for a regular file, we want to first sync all dirty data to disk and then call invalidate_inode_pages2(). The latter relies on nfs_launder_page() and nfs_release_page() to deal respectively with dirty pages, and unstable written pages. When commit 9590544694bec ("NFS: avoid deadlocks with loop-back mounted NFS filesystems.") changed the behaviour of nfs_release_page(), then it made it possible for invalidate_inode_pages2() to fail with an EBUSY. Unfortunately, that error is then propagated back to read(). Let's therefore work around the problem for now by protecting the call to sync the data and invalidate_inode_pages2() so that they are atomic w.r.t. the addition of new writes. Later on, we can revisit whether or not we still need nfs_launder_page() and nfs_release_page(). Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- fs/nfs/inode.c | 37 ++++++++++++++++++++++++++++++++++--- include/linux/nfs_fs.h | 1 + 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c045c7169fa0..41963ffca597 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -178,7 +178,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) @@ -199,7 +199,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5026c44a98e1..8edb7d049565 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1067,11 +1067,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) } /** - * nfs_revalidate_mapping - Revalidate the pagecache + * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping + * @may_lock - take inode->i_mutex? */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static int __nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping, + bool may_lock) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1120,7 +1123,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); + if (may_lock) { + mutex_lock(&inode->i_mutex); + ret = nfs_invalidate_mapping(inode, mapping); + mutex_unlock(&inode->i_mutex); + } else + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1130,6 +1138,29 @@ out: return ret; } +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, false); +} + +/** + * nfs_revalidate_mapping_protected - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * + * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex + * while invalidating the mapping. + */ +int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, true); +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 59b1516b9fd4..b01ccf371fdc 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -356,6 +356,7 @@ extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); +extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, -- cgit v1.2.3 From ef070dcb3989f553f5d84edf555eebc7e204099d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Mar 2015 00:06:35 -0500 Subject: NFS: Don't write enable new pages while an invalidation is proceeding nfs_vm_page_mkwrite() should wait until the page cache invalidation is finished. This is the second patch in a 2 patch series to deprecate the NFS client's reliance on nfs_release_page() in the context of nfs_invalidate_mapping(). Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 +++ fs/nfs/inode.c | 1 + 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 41963ffca597..e679d24c39d3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -623,6 +623,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); + wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + lock_page(page); mapping = page_file_mapping(page); if (mapping != inode->i_mapping) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8edb7d049565..d42dff6d5e98 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1035,6 +1035,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { + unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; -- cgit v1.2.3 From 48d66b9749e39e0d4cc37d635df3f18906af38a6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Mar 2015 20:28:59 -0500 Subject: NFSv4: Fix a race in NFSv4.1 server trunking discovery We do not want to allow a race with another NFS mount to cause nfs41_walk_client_list() to establish a lease on our nfs_client before we're done checking for trunking. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- fs/nfs/nfs4client.c | 9 ++++----- fs/nfs/nfs4state.c | 14 ++++++++++++-- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f9f4845db989..19874151e95c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -433,7 +433,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat static bool nfs_client_init_is_complete(const struct nfs_client *clp) { - return clp->cl_cons_state != NFS_CS_INITING; + return clp->cl_cons_state <= NFS_CS_READY; } int nfs_wait_client_init_complete(const struct nfs_client *clp) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8646af9b11d2..86d6214ea022 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -621,6 +621,9 @@ int nfs41_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + if (pos->rpc_ops != new->rpc_ops) continue; @@ -639,10 +642,6 @@ int nfs41_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - if (pos->cl_cons_state == NFS_CS_SESSION_INITING) { - nfs4_schedule_lease_recovery(pos); - status = nfs4_wait_clnt_recover(pos); - } spin_lock(&nn->nfs_client_lock); if (status < 0) break; @@ -668,7 +667,7 @@ int nfs41_walk_client_list(struct nfs_client *new, */ if (!nfs4_match_client_owner_id(pos, new)) continue; - +found: atomic_inc(&pos->cl_count); *result = pos; status = 0; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5ad908e9ce9c..d8b43f0e08fc 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -346,9 +346,19 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, status = nfs4_proc_exchange_id(clp, cred); if (status != NFS4_OK) return status; - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - return nfs41_walk_client_list(clp, result, cred); + status = nfs41_walk_client_list(clp, result, cred); + if (status < 0) + return status; + if (clp != *result) + return 0; + + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_manager(clp); + status = nfs_wait_client_init_complete(clp); + if (status < 0) + nfs_put_client(clp); + return status; } #endif /* CONFIG_NFS_V4_1 */ -- cgit v1.2.3 From e11259f920d8cb3550e0f311c064bdabe1bc3aaf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Mar 2015 20:35:31 -0500 Subject: NFSv4.1: Clear the old state by our client id before establishing a new lease If the call to exchange-id returns with the EXCHGID4_FLAG_CONFIRMED_R flag set, then that means our lease was established by a previous mount instance. Ensure that we detect this situation, and that we clear the state held by that mount. Reported-by: Jorge Mora Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 +++++++++++---- fs/nfs/nfs4session.h | 1 + fs/nfs/nfs4state.c | 6 +++++- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 732526e04cd5..627f37c44456 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6897,9 +6897,13 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (status == 0) { clp->cl_clientid = res.clientid; - clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) + clp->cl_exchange_flags = res.flags; + /* Client ID is not confirmed */ + if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); clp->cl_seqid = res.seqid; + } kfree(clp->cl_serverowner); clp->cl_serverowner = res.server_owner; @@ -7231,6 +7235,9 @@ static void nfs4_update_session(struct nfs4_session *session, struct nfs41_create_session_res *res) { nfs4_copy_sessionid(&session->sess_id, &res->sessionid); + /* Mark client id and session as being confirmed */ + session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state); session->flags = res->flags; memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs)); if (res->flags & SESSION4_BACK_CHAN) @@ -7326,8 +7333,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, dprintk("--> nfs4_proc_destroy_session\n"); /* session is still being setup */ - if (session->clp->cl_cons_state != NFS_CS_READY) - return status; + if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) + return 0; status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_destroy_session(session->clp, status); diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index fc46c7455898..e3ea2c5324d6 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -70,6 +70,7 @@ struct nfs4_session { enum nfs4_session_state { NFS4_SESSION_INITING, + NFS4_SESSION_ESTABLISHED, }; extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d8b43f0e08fc..f95e3b58bbc3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -353,7 +353,11 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, if (clp != *result) return 0; - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + /* Purge state if the client id was established in a prior instance */ + if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R) + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + else + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); nfs4_schedule_state_manager(clp); status = nfs_wait_client_init_complete(clp); if (status < 0) -- cgit v1.2.3 From 0164bf0239777811bdc3e01f45501174dc6db19d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2015 17:34:32 -0500 Subject: locks: fix fasync_struct memory leak in lease upgrade/downgrade handling Commit 8634b51f6ca2 (locks: convert lease handling to file_lock_context) introduced a regression in the handling of lease upgrade/downgrades. In the event that we already have a lease on a file and are going to either upgrade or downgrade it, we skip doing any list insertion or deletion and simply re-call lm_setup on the existing lease. As of commit 8634b51f6ca2 however, we end up calling lm_setup on the lease that was passed in, instead of on the existing lease. This causes us to leak the fasync_struct that was allocated in the event that there was not already an existing one (as it always appeared that there wasn't one). Fixes: 8634b51f6ca2 (locks: convert lease handling to file_lock_context) Reported-and-Tested-by: Daniel Wagner Signed-off-by: Jeff Layton --- fs/locks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 365c82e1b3a9..f1bad681fc1c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1665,7 +1665,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr } if (my_fl != NULL) { - error = lease->fl_lmops->lm_change(my_fl, arg, &dispose); + lease = my_fl; + error = lease->fl_lmops->lm_change(lease, arg, &dispose); if (error) goto out; goto out_setup; -- cgit v1.2.3 From f5c0a122800c301eecef93275b0c5d58bb4c15d9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 2 Mar 2015 12:51:02 -0500 Subject: Btrfs: remove extra run_delayed_refs in update_cowonly_root This got added with my dirty_bgs patch, it's not needed. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 038fcf6051e0..323c6541d3dc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1052,9 +1052,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; } return 0; -- cgit v1.2.3 From 3a8b36f378060d20062a0918e99fae39ff077bf0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 1 Mar 2015 20:36:00 +0000 Subject: Btrfs: fix data loss in the fast fsync path When using the fast file fsync code path we can miss the fact that new writes happened since the last file fsync and therefore return without waiting for the IO to finish and write the new extents to the fsync log. Here's an example scenario where the fsync will miss the fact that new file data exists that wasn't yet durably persisted: 1. fs_info->last_trans_committed == N - 1 and current transaction is transaction N (fs_info->generation == N); 2. do a buffered write; 3. fsync our inode, this clears our inode's full sync flag, starts an ordered extent and waits for it to complete - when it completes at btrfs_finish_ordered_io(), the inode's last_trans is set to the value N (via btrfs_update_inode_fallback -> btrfs_update_inode -> btrfs_set_inode_last_trans); 4. transaction N is committed, so fs_info->last_trans_committed is now set to the value N and fs_info->generation remains with the value N; 5. do another buffered write, when this happens btrfs_file_write_iter sets our inode's last_trans to the value N + 1 (that is fs_info->generation + 1 == N + 1); 6. transaction N + 1 is started and fs_info->generation now has the value N + 1; 7. transaction N + 1 is committed, so fs_info->last_trans_committed is set to the value N + 1; 8. fsync our inode - because it doesn't have the full sync flag set, we only start the ordered extent, we don't wait for it to complete (only in a later phase) therefore its last_trans field has the value N + 1 set previously by btrfs_file_write_iter(), and so we have: inode->last_trans <= fs_info->last_trans_committed (N + 1) (N + 1) Which made us not log the last buffered write and exit the fsync handler immediately, returning success (0) to user space and resulting in data loss after a crash. This can actually be triggered deterministically and the following excerpt from a testcase I made for xfstests triggers the issue. It moves a dummy file across directories and then fsyncs the old parent directory - this is just to trigger a transaction commit, so moving files around isn't directly related to the issue but it was chosen because running 'sync' for example does more than just committing the current transaction, as it flushes/waits for all file data to be persisted. The issue can also happen at random periods, since the transaction kthread periodicaly commits the current transaction (about every 30 seconds by default). The body of the test is: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our main test file 'foo', the one we check for data loss. # By doing an fsync against our file, it makes btrfs clear the 'needs_full_sync' # bit from its flags (btrfs inode specific flags). $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 8K" \ -c "fsync" $SCRATCH_MNT/foo | _filter_xfs_io # Now create one other file and 2 directories. We will move this second file # from one directory to the other later because it forces btrfs to commit its # currently open transaction if we fsync the old parent directory. This is # necessary to trigger the data loss bug that affected btrfs. mkdir $SCRATCH_MNT/testdir_1 touch $SCRATCH_MNT/testdir_1/bar mkdir $SCRATCH_MNT/testdir_2 # Make sure everything is durably persisted. sync # Write more 8Kb of data to our file. $XFS_IO_PROG -c "pwrite -S 0xbb 8K 8K" $SCRATCH_MNT/foo | _filter_xfs_io # Move our 'bar' file into a new directory. mv $SCRATCH_MNT/testdir_1/bar $SCRATCH_MNT/testdir_2/bar # Fsync our first directory. Because it had a file moved into some other # directory, this made btrfs commit the currently open transaction. This is # a condition necessary to trigger the data loss bug. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir_1 # Now fsync our main test file. If the fsync succeeds, we expect the 8Kb of # data we wrote previously to be persisted and available if a crash happens. # This did not happen with btrfs, because of the transaction commit that # happened when we fsynced the parent directory. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Now check that all data we wrote before are available. echo "File content after log replay:" od -t x1 $SCRATCH_MNT/foo status=0 exit The expected golden output for the test, which is what we get with this fix applied (or when running against ext3/4 and xfs), is: wrote 8192/8192 bytes at offset 0 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 8192/8192 bytes at offset 8192 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) File content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0020000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb * 0040000 Without this fix applied, the output shows the test file does not have the second 8Kb extent that we successfully fsynced: wrote 8192/8192 bytes at offset 0 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 8192/8192 bytes at offset 8192 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) File content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0020000 So fix this by skipping the fsync only if we're doing a full sync and if the inode's last_trans is <= fs_info->last_trans_committed, or if the inode is already in the log. Also remove setting the inode's last_trans in btrfs_file_write_iter since it's useless/unreliable. Also because btrfs_file_write_iter no longer sets inode->last_trans to fs_info->generation + 1, don't set last_trans to 0 if we bail out and don't bail out if last_trans is 0, otherwise something as simple as the following example wouldn't log the second write on the last fsync: 1. write to file 2. fsync file 3. fsync file |--> btrfs_inode_in_log() returns true and it set last_trans to 0 4. write to file |--> btrfs_file_write_iter() no longers sets last_trans, so it remained with a value of 0 5. fsync |--> inode->last_trans == 0, so it bails out without logging the second write A test case for xfstests will be sent soon. CC: Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/file.c | 56 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b476e5645034..6351947c9bb0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_unlock(&inode->i_mutex); /* - * we want to make sure fsync finds this change - * but we haven't joined a transaction running right now. - * - * Later on, someone is sure to update the inode and get the - * real transid recorded. - * - * We set last_trans now to the fs_info generation + 1, - * this will either be one more than the running transaction - * or the generation used for the next transaction if there isn't - * one running right now. - * * We also have to set last_sub_trans to the current log transid, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); @@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * check the transaction that last modified this inode - * and see if its already been committed - */ - if (!BTRFS_I(inode)->last_trans) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* - * if the last transaction that changed this file was before - * the current transaction, we can bail out now without any - * syncing + * If the last transaction that changed this file was before the current + * transaction and we have the full sync flag set in our inode, we can + * bail out now without any syncing. + * + * Note that we can't bail out if the full sync flag isn't set. This is + * because when the full sync flag is set we start all ordered extents + * and wait for them to fully complete - when they complete they update + * the inode's last_trans field through: + * + * btrfs_finish_ordered_io() -> + * btrfs_update_inode_fallback() -> + * btrfs_update_inode() -> + * btrfs_set_inode_last_trans() + * + * So we are sure that last_trans is up to date and can do this check to + * bail out safely. For the fast path, when the full sync flag is not + * set in our inode, we can not do it because we start only our ordered + * extents and don't wait for them to complete (that is when + * btrfs_finish_ordered_io runs), so here at this point their last_trans + * value might be less than or equals to fs_info->last_trans_committed, + * and setting a speculative last_trans for an inode when a buffered + * write is made (such as fs_info->generation + 1 for example) would not + * be reliable since after setting the value and before fsync is called + * any number of transactions can start and commit (transaction kthread + * commits the current transaction periodically), and a transaction + * commit does not start nor waits for ordered extents to complete. */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed) { - BTRFS_I(inode)->last_trans = 0; - + (full_sync && BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed)) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever -- cgit v1.2.3 From dd9ef135e3542ffc621c4eb7f0091870ec7a1504 Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Tue, 3 Mar 2015 16:31:38 +0100 Subject: Btrfs:__add_inode_ref: out of bounds memory read when looking for extended ref. Improper arithmetics when calculting the address of the extended ref could lead to an out of bounds memory read and kernel panic. Signed-off-by: Quentin Casasnovas Reviewed-by: David Sterba cc: stable@vger.kernel.org # v3.7+ Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f96996a1b70c..9a1c1711f360 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1012,7 +1012,7 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)base + cur_offset; + extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name_len = btrfs_inode_extref_name_len(leaf, extref); -- cgit v1.2.3