3 files changed, 77 insertions, 118 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 61205e3bbefa..1da5753d886d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,6 +196,16 @@ struct btrfs_inode {
 	struct list_head delayed_iput;
 	long delayed_iput_count;
 
+	/*
+	 * To avoid races between lockless (i_mutex not held) direct IO writes
+	 * and concurrent fsync requests. Direct IO writes must acquire read
+	 * access on this semaphore for creating an extent map and its
+	 * corresponding ordered extent. The fast fsync path must acquire write
+	 * access on this semaphore before it collects ordered extents and
+	 * extent maps.
+	 */
+	struct rw_semaphore dio_sem;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ee9be4199e7c..c1ee4ade2d87 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7145,6 +7145,43 @@ out:
 	return em;
 }
 
+static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+						  const u64 start,
+						  const u64 len,
+						  const u64 orig_start,
+						  const u64 block_start,
+						  const u64 block_len,
+						  const u64 orig_block_len,
+						  const u64 ram_bytes,
+						  const int type)
+{
+	struct extent_map *em = NULL; /* stays NULL on NOCOW success */
+	int ret;
+	/* Create the extent map and its ordered extent under dio_sem. */
+	down_read(&BTRFS_I(inode)->dio_sem);
+	if (type != BTRFS_ORDERED_NOCOW) { /* NOCOW creates no extent map */
+		em = create_pinned_em(inode, start, len, orig_start,
+				      block_start, block_len, orig_block_len,
+				      ram_bytes, type);
+		if (IS_ERR(em))
+			goto out; /* em already holds the ERR_PTR */
+	}
+	ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+					   len, block_len, type);
+	if (ret) {
+		if (em) { /* ordered extent failed: discard the new map */
+			free_extent_map(em);
+			btrfs_drop_extent_cache(inode, start,
+						start + len - 1, 0);
+		}
+		em = ERR_PTR(ret);
+	}
+ out:
+	up_read(&BTRFS_I(inode)->dio_sem);
+	/* Returns the new map, NULL (NOCOW success) or an ERR_PTR. */
+	return em;
+}
+
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 						  u64 start, u64 len)
 {
@@ -7160,43 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	if (ret)
 		return ERR_PTR(ret);
 
-	/*
-	 * Create the ordered extent before the extent map. This is to avoid
-	 * races with the fast fsync path that would lead to it logging file
-	 * extent items that point to disk extents that were not yet written to.
-	 * The fast fsync path collects ordered extents into a local list and
-	 * then collects all the new extent maps, so we must create the ordered
-	 * extent first and make sure the fast fsync path collects any new
-	 * ordered extents after collecting new extent maps as well.
-	 * The fsync path simply can not rely on inode_dio_wait() because it
-	 * causes deadlock with AIO.
-	 */
-	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
-					   ins.offset, ins.offset, 0);
-	if (ret) {
-		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		return ERR_PTR(ret);
-	}
-
+	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+				     ins.objectid, ins.offset, ins.offset,
+				     ins.offset, 0);
 	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
-
-	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-			      ins.offset, ins.offset, ins.offset, 0);
-	if (IS_ERR(em)) {
-		struct btrfs_ordered_extent *oe;
-
+	if (IS_ERR(em))
 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		oe = btrfs_lookup_ordered_extent(inode, start);
-		ASSERT(oe);
-		if (WARN_ON(!oe))
-			return em;
-		set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
-		set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
-		btrfs_remove_ordered_extent(inode, oe);
-		/* Once for our lookup and once for the ordered extents tree. */
-		btrfs_put_ordered_extent(oe);
-		btrfs_put_ordered_extent(oe);
-	}
+
 	return em;
 }
 
@@ -7670,57 +7677,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		if (can_nocow_extent(inode, start, &len, &orig_start,
 				     &orig_block_len, &ram_bytes) == 1 &&
 		    btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+			struct extent_map *em2;
 
-			/*
-			 * Create the ordered extent before the extent map. This
-			 * is to avoid races with the fast fsync path because it
-			 * collects ordered extents into a local list and then
-			 * collects all the new extent maps, so we must create
-			 * the ordered extent first and make sure the fast fsync
-			 * path collects any new ordered extents after
-			 * collecting new extent maps as well. The fsync path
-			 * simply can not rely on inode_dio_wait() because it
-			 * causes deadlock with AIO.
-			 */
-			ret = btrfs_add_ordered_extent_dio(inode, start,
-					   block_start, len, len, type);
+			em2 = btrfs_create_dio_extent(inode, start, len,
+						      orig_start, block_start,
+						      len, orig_block_len,
+						      ram_bytes, type);
 			btrfs_dec_nocow_writers(root->fs_info, block_start);
-			if (ret) {
-				free_extent_map(em);
-				goto unlock_err;
-			}
-
 			if (type == BTRFS_ORDERED_PREALLOC) {
 				free_extent_map(em);
-				em = create_pinned_em(inode, start, len,
-						       orig_start,
-						       block_start, len,
-						       orig_block_len,
-						       ram_bytes, type);
-				if (IS_ERR(em)) {
-					struct btrfs_ordered_extent *oe;
-
-					ret = PTR_ERR(em);
-					oe = btrfs_lookup_ordered_extent(inode,
-									 start);
-					ASSERT(oe);
-					if (WARN_ON(!oe))
-						goto unlock_err;
-					set_bit(BTRFS_ORDERED_IOERR,
-						&oe->flags);
-					set_bit(BTRFS_ORDERED_IO_DONE,
-						&oe->flags);
-					btrfs_remove_ordered_extent(inode, oe);
-					/*
-					 * Once for our lookup and once for the
-					 * ordered extents tree.
-					 */
-					btrfs_put_ordered_extent(oe);
-					btrfs_put_ordered_extent(oe);
-					goto unlock_err;
-				}
+				em = em2;
+			}
+			if (em2 && IS_ERR(em2)) {
+				ret = PTR_ERR(em2);
+				goto unlock_err;
 			}
-
 			goto unlock;
 		}
 	}
@@ -9281,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->delayed_iput);
 	RB_CLEAR_NODE(&ei->rb_node);
+	init_rwsem(&ei->dio_sem);
 
 	return inode;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a24a0ba523d6..003a826f4cff 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
 	INIT_LIST_HEAD(&extents);
 
+	down_write(&BTRFS_I(inode)->dio_sem);
 	write_lock(&tree->lock);
 	test_gen = root->fs_info->last_trans_committed;
 
@@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 
 	list_sort(NULL, &extents, extent_cmp);
+	btrfs_get_logged_extents(inode, logged_list, start, end);
 	/*
-	 * Collect any new ordered extents within the range. This is to
-	 * prevent logging file extent items without waiting for the disk
-	 * location they point to being written. We do this only to deal
-	 * with races against concurrent lockless direct IO writes.
+	 * Some ordered extents started by fsync might have completed
+	 * before we could collect them into the list logged_list, which
+	 * means they're gone, not in our logged_list nor in the inode's
+	 * ordered tree. We want the application/user space to know an
+	 * error happened while attempting to persist file data so that
+	 * it can take proper action. If such error happened, we leave
+	 * without writing to the log tree and the fsync must report the
+	 * file data write error and not commit the current transaction.
 	 */
-	btrfs_get_logged_extents(inode, logged_list, start, end);
+	ret = btrfs_inode_check_errors(inode);
+	if (ret)
+		ctx->io_err = ret;
 process:
 	while (!list_empty(&extents)) {
 		em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4210,7 @@ process:
 	}
 	WARN_ON(!list_empty(&extents));
 	write_unlock(&tree->lock);
+	up_write(&BTRFS_I(inode)->dio_sem);
 
 	btrfs_release_path(path);
 	return ret;
@@ -4623,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
 	/*
-	 * Collect ordered extents only if we are logging data. This is to
-	 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
-	 * will process the ordered extents if they still exists at the time,
-	 * because when we collect them we test and set for the flag
-	 * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
-	 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
-	 * not processing the ordered extents is that we end up logging the
-	 * corresponding file extent items, based on the extent maps in the
-	 * inode's extent_map_tree's modified_list, without logging the
-	 * respective checksums (since the may still be only attached to the
-	 * ordered extents and have not been inserted in the csum tree by
-	 * btrfs_finish_ordered_io() yet).
-	 */
-	if (inode_only == LOG_INODE_ALL)
-		btrfs_get_logged_extents(inode, &logged_list, start, end);
-
-	/*
 	 * a brute force approach to making sure we get the most uptodate
 	 * copies of everything.
 	 */
@@ -4846,21 +4838,6 @@ log_extents:
 			goto out_unlock;
 	}
 	if (fast_search) {
-		/*
-		 * Some ordered extents started by fsync might have completed
-		 * before we collected the ordered extents in logged_list, which
-		 * means they're gone, not in our logged_list nor in the inode's
-		 * ordered tree. We want the application/user space to know an
-		 * error happened while attempting to persist file data so that
-		 * it can take proper action. If such error happened, we leave
-		 * without writing to the log tree and the fsync must report the
-		 * file data write error and not commit the current transaction.
-		 */
-		err = btrfs_inode_check_errors(inode);
-		if (err) {
-			ctx->io_err = err;
-			goto out_unlock;
-		}
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
 						&logged_list, ctx, start, end);
 		if (ret) {