1 files changed, 46 insertions, 81 deletions
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d919ad7b16bf..2f1839e4dd1b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -187,6 +187,8 @@ xfs_generic_create(
 	else
 		d_instantiate(dentry, inode);
 
+	xfs_finish_inode_setup(ip);
+
  out_free_acl:
 	if (default_acl)
 		posix_acl_release(default_acl);
@@ -195,6 +197,7 @@ xfs_generic_create(
 	return error;
 
  out_cleanup_inode:
+	xfs_finish_inode_setup(ip);
 	if (!tmpfile)
 		xfs_cleanup_inode(dir, inode, dentry);
 	iput(inode);
@@ -367,9 +370,11 @@ xfs_vn_symlink(
 		goto out_cleanup_inode;
 
 	d_instantiate(dentry, inode);
+	xfs_finish_inode_setup(cip);
 	return 0;
 
  out_cleanup_inode:
+	xfs_finish_inode_setup(cip);
 	xfs_cleanup_inode(dir, inode, dentry);
 	iput(inode);
  out:
@@ -389,7 +394,7 @@ xfs_vn_rename(
 	struct xfs_name	oname;
 	struct xfs_name	nname;
 
-	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
 	/* if we are exchanging files, we need to set i_mode of both files */
@@ -751,6 +756,7 @@ xfs_setattr_size(
 	int			error;
 	uint			lock_flags = 0;
 	uint			commit_flags = 0;
+	bool			did_zeroing = false;
 
 	trace_xfs_setattr(ip);
 
@@ -765,6 +771,7 @@ xfs_setattr_size(
 		return error;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
 	ASSERT(S_ISREG(ip->i_d.di_mode));
 	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -794,20 +801,16 @@ xfs_setattr_size(
 		return error;
 
 	/*
-	 * Now we can make the changes.  Before we join the inode to the
-	 * transaction, take care of the part of the truncation that must be
-	 * done without the inode lock.  This needs to be done before joining
-	 * the inode to the transaction, because the inode cannot be unlocked
-	 * once it is a part of the transaction.
+	 * File data changes must be complete before we start the transaction to
+	 * modify the inode.  This needs to be done before joining the inode to
+	 * the transaction because the inode cannot be unlocked once it is a
+	 * part of the transaction.
+	 *
+	 * Start with zeroing any data block beyond EOF that we may expose on
+	 * file extension.
 	 */
 	if (newsize > oldsize) {
-		/*
-		 * Do the first part of growing a file: zero any data in the
-		 * last block that is beyond the old EOF.  We need to do this
-		 * before the inode is joined to the transaction to modify
-		 * i_size.
-		 */
-		error = xfs_zero_eof(ip, newsize, oldsize);
+		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
 		if (error)
 			return error;
 	}
@@ -817,75 +820,42 @@ xfs_setattr_size(
 	 * any previous writes that are beyond the on disk EOF and the new
 	 * EOF that have not been written out need to be written here.  If we
 	 * do not write the data out, we expose ourselves to the null files
-	 * problem.
-	 *
-	 * Only flush from the on disk size to the smaller of the in memory
-	 * file size or the new size as that's the range we really care about
-	 * here and prevents waiting for other data not within the range we
-	 * care about here.
+	 * problem. Note that this includes any block zeroing we did above;
+	 * otherwise those blocks may not be zeroed after a crash.
 	 */
-	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+	if (newsize > ip->i_d.di_size &&
+	    (oldsize != ip->i_d.di_size || did_zeroing)) {
 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						      ip->i_d.di_size, newsize);
 		if (error)
 			return error;
 	}
 
-	/*
-	 * Wait for all direct I/O to complete.
-	 */
+	/* Now wait for all direct I/O to complete. */
 	inode_dio_wait(inode);
 
 	/*
-	 * Do all the page cache truncate work outside the transaction context
-	 * as the "lock" order is page lock->log space reservation.  i.e.
-	 * locking pages inside the transaction can ABBA deadlock with
-	 * writeback. We have to do the VFS inode size update before we truncate
-	 * the pagecache, however, to avoid racing with page faults beyond the
-	 * new EOF they are not serialised against truncate operations except by
-	 * page locks and size updates.
+	 * We've already locked out new page faults, so now we can safely remove
+	 * pages from the page cache knowing they won't get refaulted until we
+	 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
+	 * complete. The truncate_setsize() call also cleans partial EOF page
+	 * PTEs on extending truncates and hence ensures sub-page block size
+	 * filesystems are correctly handled, too.
 	 *
-	 * Hence we are in a situation where a truncate can fail with ENOMEM
-	 * from xfs_trans_reserve(), but having already truncated the in-memory
-	 * version of the file (i.e. made user visible changes). There's not
-	 * much we can do about this, except to hope that the caller sees ENOMEM
-	 * and retries the truncate operation.
+	 * We have to do all the page cache truncate work outside the
+	 * transaction context as the "lock" order is page lock->log space
+	 * reservation as defined by extent allocation in the writeback path.
+	 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+	 * having already truncated the in-memory version of the file (i.e. made
+	 * user visible changes). There's not much we can do about this, except
+	 * to hope that the caller sees ENOMEM and retries the truncate
+	 * operation.
 	 */
 	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);
 
-	/*
-	 * The "we can't serialise against page faults" pain gets worse.
-	 *
-	 * If the file is mapped then we have to clean the page at the old EOF
-	 * when extending the file. Extending the file can expose changes the
-	 * underlying page mapping (e.g. from beyond EOF to a hole or
-	 * unwritten), and so on the next attempt to write to that page we need
-	 * to remap it for write. i.e. we need .page_mkwrite() to be called.
-	 * Hence we need to clean the page to clean the pte and so a new write
-	 * fault will be triggered appropriately.
-	 *
-	 * If we do it before we change the inode size, then we can race with a
-	 * page fault that maps the page with exactly the same problem. If we do
-	 * it after we change the file size, then a new page fault can come in
-	 * and allocate space before we've run the rest of the truncate
-	 * transaction. That's kinda grotesque, but it's better than have data
-	 * over a hole, and so that's the lesser evil that has been chosen here.
-	 *
-	 * The real solution, however, is to have some mechanism for locking out
-	 * page faults while a truncate is in progress.
-	 */
-	if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
-		error = filemap_write_and_wait_range(
-				VFS_I(ip)->i_mapping,
-				round_down(oldsize, PAGE_CACHE_SIZE),
-				round_up(oldsize, PAGE_CACHE_SIZE) - 1);
-		if (error)
-			return error;
-	}
-
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error)
@@ -983,9 +953,13 @@ xfs_vn_setattr(
 		uint		iolock = XFS_IOLOCK_EXCL;
 
 		xfs_ilock(ip, iolock);
-		error = xfs_break_layouts(dentry->d_inode, &iolock);
-		if (!error)
+		error = xfs_break_layouts(dentry->d_inode, &iolock, true);
+		if (!error) {
+			xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+			iolock |= XFS_MMAPLOCK_EXCL;
+
 			error = xfs_setattr_size(ip, iattr);
+		}
 		xfs_iunlock(ip, iolock);
 	} else {
 		error = xfs_setattr_nonsize(ip, iattr, 0);
@@ -1236,16 +1210,12 @@ xfs_diflags_to_iflags(
 }
 
 /*
- * Initialize the Linux inode, set up the operation vectors and
- * unlock the inode.
- *
- * When reading existing inodes from disk this is called directly
- * from xfs_iget, when creating a new inode it is called from
- * xfs_ialloc after setting up the inode.
+ * Initialize the Linux inode and set up the operation vectors.
  *
- * We are always called with an uninitialised linux inode here.
- * We need to initialise the necessary fields and take a reference
- * on it.
+ * When reading existing inodes from disk this is called directly from xfs_iget,
+ * when creating a new inode it is called from xfs_ialloc after setting up the
+ * inode. These callers have different criteria for clearing XFS_INEW, so leave
+ * it up to the caller to deal with unlocking the inode appropriately.
  */
 void
 xfs_setup_inode(
@@ -1332,9 +1302,4 @@ xfs_setup_inode(
 		inode_has_no_xattr(inode);
 		cache_no_acl(inode);
 	}
-
-	xfs_iflags_clear(ip, XFS_INEW);
-	barrier();
-
-	unlock_new_inode(inode);
 }