From a1d3c4786a4b9c71c0767aa656a759968f7554b6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 17:15:33 +0200 Subject: btrfs: btrfs_multi_bio replaced with btrfs_bio btrfs_bio is a bio abstraction that can be split and does not complete until the last bio has returned (like the old btrfs_multi_bio). Additionally, btrfs_bio tracks the mirror_num used to read data, which can be used for error correction purposes. Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..119f842c1d4f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1770,18 +1770,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, { int ret; u64 discarded_bytes = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &multi, 0); + bytenr, &num_bytes, &bbio, 0); if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < multi->num_stripes; i++, stripe++) { + for (i = 0; i < bbio->num_stripes; i++, stripe++) { if (!stripe->dev->can_discard) continue; @@ -1800,7 +1800,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(multi); + kfree(bbio); } if (actual_bytes) -- cgit v1.2.1 From 0cbbdf7c9c46467bfb7129c30236f36a679ab244 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jul 2011 16:02:04 -0400 Subject: Btrfs: kill reserved_bytes in inode reserved_bytes is not used for anything in the inode, so remove it. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..03edac4f7771 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3122,7 +3122,6 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); return 0; @@ -3144,7 +3143,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; spin_unlock(&data_sinfo->lock); } -- cgit v1.2.1 From fb25e9141ab843794d5cdef3936ccb58435e2371 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Jul 2011 17:00:46 -0400 Subject: Btrfs: use bytes_may_use for all ENOSPC reservations We have been using bytes_reserved for metadata reservations, which is wrong since we use that to keep track of outstanding reservations from the allocator. This resulted in us doing a lot of silly things to make sure we don't allocate a bunch of metadata chunks, since we never had a real view of how much space was actually in use by metadata. This passes Arne's enospc test and xfstests as well as my own enospc tests. Hopefully this will get us moving in the right direction.
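As a rough illustration of the accounting model this moves us to (a sketch, not code from the patch; the real struct btrfs_space_info carries more fields), the counters now split up like this:

/* sketch only -- simplified view of the space_info counters,
 * assuming the kernel's u64 type */
struct space_info_sketch {
	u64 bytes_used;     /* extents allocated and in use on disk */
	u64 bytes_reserved; /* handed out by the allocator, not yet used */
	u64 bytes_may_use;  /* ENOSPC reservations not yet tied to extents */
};

Keeping in-flight ENOSPC reservations in bytes_may_use instead of bytes_reserved is what gives should_alloc_chunk() a real view of how much metadata space is actually spoken for.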
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 163 +++++++++++++++++++++++++++---------------------- 1 file changed, 89 insertions(+), 74 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 03edac4f7771..fbe6278f466b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -52,6 +52,21 @@ enum { CHUNK_ALLOC_LIMITED = 2, }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +96,8 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -3128,9 +3145,7 @@ commit_trans: } /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { @@ -3163,6 +3178,7 @@ static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, u64 alloc_bytes, int force) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; @@ -3170,6 +3186,13 @@ static int should_alloc_chunk(struct btrfs_root *root, if (force == CHUNK_ALLOC_FORCE) return 1; + /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + num_allocated += global_rsv->size; + /* * in limited mode, we want to have some free space up to * about 1% of the FS size. 
@@ -3317,7 +3340,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_reserved; + reserved = space_info->bytes_may_use; progress = space_info->reservation_progress; if (reserved == 0) @@ -3341,9 +3364,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) - reclaimed += reserved - space_info->bytes_reserved; - reserved = space_info->bytes_reserved; + if (reserved > space_info->bytes_may_use) + reclaimed += reserved - space_info->bytes_may_use; + reserved = space_info->bytes_may_use; spin_unlock(&space_info->lock); loops++; @@ -3401,7 +3424,6 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, int ret = 0; bool committed = false; bool flushing = false; - again: ret = 0; spin_lock(&space_info->lock); @@ -3443,7 +3465,7 @@ again: if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; if (unused >= num_bytes) { - space_info->bytes_reserved += orig_bytes; + space_info->bytes_may_use += orig_bytes; ret = 0; } else { /* @@ -3614,7 +3636,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, } if (num_bytes) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= num_bytes; + space_info->bytes_may_use -= num_bytes; space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3825,12 +3847,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; - sinfo->bytes_reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_reserved -= num_bytes; + sinfo->bytes_may_use -= num_bytes; sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4133,7 +4155,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4185,7 +4206,6 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4212,46 +4232,55 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } -/* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. 
+ * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) { + struct btrfs_space_info *space_info = cache->space_info; int ret = 0; - if (sinfo) { - struct btrfs_space_info *space_info = cache->space_info; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - spin_lock(&cache->lock); + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { if (cache->ro) { ret = -EAGAIN; } else { - if (reserve) - cache->reserved += num_bytes; - else - cache->reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + BUG_ON(space_info->bytes_may_use < num_bytes); + space_info->bytes_may_use -= num_bytes; + } } - spin_unlock(&cache->lock); + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); return ret; } @@ -4322,7 +4351,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } else if (cache->reserved_pinned > 0) { len = min(len, cache->reserved_pinned); cache->reserved_pinned -= len; - cache->space_info->bytes_reserved += len; + cache->space_info->bytes_may_use += len; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4701,27 +4730,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); - if (ret == -EAGAIN) { - /* block group became read-only */ - btrfs_update_reserved_bytes(cache, buf->len, 0, 1); - goto out; - } + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - ret = 1; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - block_rsv->reserved += buf->len; - ret = 0; - } - spin_unlock(&block_rsv->lock); - - if (ret) { - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_reserved -= buf->len; - cache->space_info->reservation_progress++; - spin_unlock(&cache->space_info->lock); - } goto out; } pin: @@ -4881,6 +4891,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; int index = 0; + int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 
+ RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@ -5200,8 +5212,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, - (data & BTRFS_BLOCK_GROUP_DATA)); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -5323,7 +5335,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", + printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + (unsigned long long)info->flags, (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly), @@ -5425,7 +5438,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, 0, 1); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5628,7 +5641,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -6594,7 +6608,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->reserved_pinned + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_reserved += cache->reserved_pinned; + sinfo->bytes_may_use += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; @@ -6962,7 +6976,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_space_info, list); if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0) { + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { WARN_ON(1); dump_space_info(space_info, 0, 0); } -- cgit v1.2.1 From 7709cde33f12db71efb377fae4ae7aab6c94ebc6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 10:25:02 -0400 Subject: Btrfs: calculate checksum space correctly We have not been reserving enough space for checksums. We were just reserving bytes for the checksum items themselves, we were not taking into account having to cow the tree and such. This patch adds a csum_bytes counter to the inode for keeping track of the number of bytes outstanding we have for checksums. Then we calculate how many leaves would be required for the checksums we are given and use that to reserve space. This adds a significant amount of bytes to our reservations, but we will handle this later. 
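To make the reservation math concrete, here is a sketch of the rounding the new calc_csum_metadata_size() (shown in the diff below) performs; the standalone helper and its name are mine, but the arithmetic mirrors the patch:

/* one checksum per sector; charge whole leaves, rounded up */
static u64 csum_leaves_for(u64 csum_bytes, u64 sectorsize,
			   u64 csums_per_leaf)
{
	u64 num_csums = div64_u64(csum_bytes, sectorsize);

	return (num_csums + csums_per_leaf - 1) / csums_per_leaf;
}

The amount actually reserved or freed is the change in this leaf count before and after csum_bytes is adjusted, multiplied out by btrfs_calc_trans_metadata_size(), so a change that stays within the same number of leaves costs nothing extra.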
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 117 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 109 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fbe6278f466b..4add1ac2dda0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3984,11 +3984,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either after + * an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ static unsigned drop_outstanding_extent(struct inode *inode) { unsigned dropped_extents = 0; - spin_lock(&BTRFS_I(inode)->lock); BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; @@ -3998,19 +4006,70 @@ static unsigned drop_outstanding_extent(struct inode *inode) */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - goto out; + return 0; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; -out: - spin_unlock(&BTRFS_I(inode)->lock); return dropped_extents; } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * calc_csum_metadata_size - return the amount of metadata space that must be + * reserved/freed for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksummed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held.
+ */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) { - return num_bytes >>= 3; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) @@ -4037,9 +4096,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); } + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { unsigned dropped; @@ -4047,8 +4106,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * We don't need the return value since our reservation failed, * we just need to clean up our counter. */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); WARN_ON(dropped > 1); + BTRFS_I(inode)->csum_bytes -= num_bytes; + spin_unlock(&BTRFS_I(inode)->lock); return ret; } @@ -4057,6 +4119,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. 
+ */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4064,9 +4135,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4074,6 +4147,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) to_free); } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; @@ -4091,6 +4179,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) { btrfs_delalloc_release_metadata(inode, num_bytes); -- cgit v1.2.1 From 37be25bcb6d731914e126f8de59c4367f0d66b80 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Aug 2011 10:25:38 -0400 Subject: Btrfs: kill the durable block rsv stuff This is confusing code and isn't used by anything anymore, so delete it. 
Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 100 +++++++++---------------------------------------- 1 file changed, 17 insertions(+), 83 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4add1ac2dda0..30c0558eae84 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -121,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -3662,7 +3661,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) spin_lock_init(&rsv->lock); atomic_set(&rsv->usage, 1); rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3685,25 +3683,10 @@ void btrfs_free_block_rsv(struct btrfs_root *root, { if (rsv && atomic_dec_and_test(&rsv->usage)) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); + kfree(rsv); } } -/* - * make the block_rsv struct be able to capture freed space. - * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); -} - int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -3745,9 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, ret = 0; } else { num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; + commit_trans = 1; } spin_unlock(&block_rsv->lock); if (!ret) @@ -3763,8 +3744,18 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, } if (commit_trans) { + struct btrfs_space_info *sinfo = block_rsv->space_info; + if (trans) return -EAGAIN; + + spin_lock(&sinfo->lock); + if (sinfo->bytes_pinned < num_bytes) { + spin_unlock(&sinfo->lock); + return -ENOSPC; + } + spin_unlock(&sinfo->lock); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); @@ -3885,10 +3876,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -4447,13 +4434,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_may_use += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4468,11 +4450,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv 
*next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4495,30 +4474,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4820,36 +4775,18 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } } out: /* @@ -6705,12 +6642,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_may_use += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } -- cgit v1.2.1 From dabdb6408cb801644fa613c7432da012640b348c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 12:50:18 -0400 Subject: Btrfs: kill unused parts of block_rsv The priority and refill_used flags are not used anymore, and neither is the usage counter, so just remove them from btrfs_block_rsv. 
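With the durable machinery gone and these fields dropped, the reservation struct is down to its essentials; roughly (an approximation of the post-cleanup shape, not the verbatim kernel definition):

struct btrfs_block_rsv {
	u64 size;			/* how much we want reserved */
	u64 reserved;			/* how much we actually hold */
	struct btrfs_space_info *space_info;
	spinlock_t lock;
	unsigned int full;		/* reserved has reached size */
};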
Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 30c0558eae84..5395cc639270 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3659,8 +3659,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); - atomic_set(&rsv->usage, 1); - rsv->priority = 6; } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3681,10 +3679,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - if (rsv && atomic_dec_and_test(&rsv->usage)) { - btrfs_block_rsv_release(root, rsv, (u64)-1); - kfree(rsv); - } + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); } int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, @@ -3734,13 +3730,10 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; } if (commit_trans) { @@ -3859,16 +3852,12 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - fs_info->chunk_block_rsv.priority = 10; space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->global_block_rsv.priority = 10; - fs_info->global_block_rsv.refill_used = 1; fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.priority = 10; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; -- cgit v1.2.1 From 13553e5221d6901a33b3f2157a389de085c161fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 13:33:21 -0400 Subject: Btrfs: don't try to commit in btrfs_block_rsv_check We will try and reserve metadata bytes in btrfs_block_rsv_check and if we cannot because we have a transaction open it will return EAGAIN, so we do not need to try and commit the transaction again. 
Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5395cc639270..6356ef2f0c80 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3708,7 +3708,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, u64 min_reserved, int min_factor) { u64 num_bytes = 0; - int commit_trans = 0; int ret = -ENOSPC; if (!block_rsv) @@ -3720,13 +3719,12 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (min_reserved > num_bytes) num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) { + if (block_rsv->reserved >= num_bytes) ret = 0; - } else { + else num_bytes -= block_rsv->reserved; - commit_trans = 1; - } spin_unlock(&block_rsv->lock); + if (!ret) return 0; @@ -3736,26 +3734,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - if (commit_trans) { - struct btrfs_space_info *sinfo = block_rsv->space_info; - - if (trans) - return -EAGAIN; - - spin_lock(&sinfo->lock); - if (sinfo->bytes_pinned < num_bytes) { - spin_unlock(&sinfo->lock); - return -ENOSPC; - } - spin_unlock(&sinfo->lock); - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - return 0; - } - - return -ENOSPC; + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, -- cgit v1.2.1 From 482e6dc5261406fdb921946e70b51467b0305bad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 10:31:56 -0400 Subject: Btrfs: allow callers to specify if flushing can occur for btrfs_block_rsv_check If you run xfstest 224 you will get lots of messages about not being able to delete inodes and that they will be cleaned up next mount.
This is because btrfs_block_rsv_check was not calling reserve_metadata_bytes with the ability to flush, so if there was not enough space, it simply failed. But in truncate and evict case we could easily flush space to try and get enough space to do our work, so make btrfs_block_rsv_check take a flush argument to pass down to reserve_metadata_bytes. Now xfstests 224 runs fine without all those complaints. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d05967e9d613..a71fcf506531 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3705,7 +3705,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor) + u64 min_reserved, int min_factor, int flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3728,7 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; -- cgit v1.2.1 From 7ed49f187c82821e35f8869399bcf90822a74a23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 15:45:52 -0400 Subject: Btrfs: fix space leak when we fail to make an allocation When changing back to using a spin_lock to protect the extent counters I decided that since we would only be dropping our original extent, it was ok to just drop the extent and return. However since somebody else could have come in and done a reservation, we need to do the normal song and dance to clear the reservation out properly. So calculate how much space we need to free, and then subtract what we just attempted to reserve. If it's more then we know we need to drop those bytes from the delalloc block rsv. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a71fcf506531..99ab5716baad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4025,16 +4025,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { + u64 to_free = 0; unsigned dropped; - /* - * We don't need the return value since our reservation failed, - * we just need to clean up our counter. - */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - WARN_ON(dropped > 1); - BTRFS_I(inode)->csum_bytes -= num_bytes; + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + /* + * Somebody could have come in and twiddled with the + * reservation, so if we have to free more than we would have + * reserved from this reservation go ahead and release those + * bytes. 
+ */ + to_free -= to_reserve; + if (to_free) + btrfs_block_rsv_release(root, block_rsv, to_free); return ret; } -- cgit v1.2.1 From 7f70150896ebd1169d9c43484c8c424f755353c4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Aug 2011 15:23:19 -0400 Subject: Btrfs: don't increase the block_rsv's size when emergency allocating space If we have to emergency reserve space we need to not increase the block_rsv size, otherwise we'll leak space. Take for instance delalloc, say we reserve 4k, and we use that 4k, and then we have to emergency allocate another 4k, we bump the size up to 8k, however we've only accounted for 4k in reservations in all of our supporting logic, so we'll go to free the 4k and end up having a size of 4k, which will cause us to later not free as much space. I saw this doing testing where I wasn't reserving enough space for something but was still leaking space, very frustrating. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99ab5716baad..1f1d3e8dcec9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5707,9 +5707,6 @@ use_block_rsv(struct btrfs_trans_handle *trans, ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); if (!ret) { - spin_lock(&block_rsv->lock); - block_rsv->size += blocksize; - spin_unlock(&block_rsv->lock); return block_rsv; } else if (ret && block_rsv != global_rsv) { ret = block_rsv_use_bytes(global_rsv, blocksize); -- cgit v1.2.1 From c09544e07f8cdc455ed8615d4c067d694c33bd18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 10:19:10 -0400 Subject: Btrfs: handle enospc accounting for free space inodes Since free space inodes now use normal checksumming we need to make sure to account for their metadata use. So reserve metadata space, and then if we fail to write out the metadata we can just release it, otherwise it will be freed up when the io completes. 
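The pairing this establishes in cache_save_setup() looks roughly like the sketch below; do_the_prealloc() is a hypothetical stand-in for the btrfs_prealloc_file_range_trans() call in the diff:

ret = btrfs_delalloc_reserve_space(inode, num_pages);
if (ret)
	goto out_put;

ret = do_the_prealloc(inode, num_pages);
if (ret) {
	/* failure: drop the data and metadata halves together */
	btrfs_delalloc_release_space(inode, num_pages);
} else {
	/* success: drop only the data half now; the metadata half is
	 * released when the cache IO completes */
	btrfs_free_reserved_data_space(inode, num_pages);
}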
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1f1d3e8dcec9..ccdc4d12e8d4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2755,16 +2755,20 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_delalloc_reserve_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) + if (!ret) { dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + btrfs_free_reserved_data_space(inode, num_pages); + } else { + btrfs_delalloc_release_space(inode, num_pages); + } + out_put: iput(inode); out_free: @@ -4002,9 +4006,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve = 0; unsigned nr_extents = 0; + int flush = 1; int ret; - if (btrfs_transaction_in_commit(root->fs_info)) + if (btrfs_is_free_space_inode(root, inode)) + flush = 0; + + if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); @@ -4023,7 +4031,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; -- cgit v1.2.1 From 4c13d758b7e79c14a0026c1f783f0c79e339b7bb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:31:29 -0400 Subject: Btrfs: use the transactions block_rsv for the csum root The alloc warnings everybody has been seeing are because we have been reserving space for csums, but we weren't actually using that space. So make get_block_rsv() return the trans->block_rsv if we're modifying the csum root. Also set trans->block_rsv to NULL so that if we modify the csum root when running delayed refs, the reservation comes out of the global reserve like it's supposed to. With this patch I'm not seeing those alloc warnings anymore.
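The caller-side half of this lives outside extent-tree.c, so here is only a sketch of the pattern the message describes (assumed, not taken from this diff):

/* with trans->block_rsv cleared, get_block_rsv() falls through to
 * root->block_rsv, which for the csum root is the global reserve */
trans->block_rsv = NULL;
btrfs_run_delayed_refs(trans, root, (unsigned long)-1);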
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ccdc4d12e8d4..53f6dbdab510 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3565,10 +3565,12 @@ out: static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (root->ref_cows) + struct btrfs_block_rsv *block_rsv = NULL; + + if (root->ref_cows || root == root->fs_info->csum_root) block_rsv = trans->block_rsv; - else + + if (!block_rsv) block_rsv = root->block_rsv; if (!block_rsv) @@ -3865,12 +3867,13 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_block_rsv *block_rsv; + if (!trans->bytes_reserved) return; - BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); + block_rsv = &root->fs_info->trans_block_rsv; + btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } -- cgit v1.2.1 From d02c9955ded7fc56dd1edc987558b084ccb03eb4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:40:48 -0400 Subject: Btrfs: don't get the block_rsv in btrfs_free_tree_block Since the durable block rsv stuff has been killed there is no need to get the block_rsv in btrfs_free_tree_block anymore. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53f6dbdab510..d236df790156 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4707,7 +4707,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_rsv *block_rsv; struct btrfs_block_group_cache *cache = NULL; int ret; @@ -4722,10 +4721,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (block_rsv->space_info != cache->space_info) - goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { -- cgit v1.2.1 From 4a92b1b8d2810db4ea0c34616b94c0b3810fa027 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 12:34:28 -0400 Subject: Btrfs: stop passing a trans handle all around the reservation code The only thing that we need to have a trans handle for is in reserve_metadata_bytes and that's to know how much flushing we can do. So instead of passing it around, just check current->journal_info for a trans_handle so we know if we can commit a transaction to try and free up space or not.
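This works because btrfs already stashes the running handle in the task. Roughly, and as a sketch of the existing transaction code rather than anything in this diff:

/* on btrfs_start_transaction() */
current->journal_info = trans;
...
/* on btrfs_end_transaction() */
current->journal_info = NULL;

So reserve_metadata_bytes() can recover the handle with a plain read of current->journal_info, as the diff below does, instead of having one threaded through every caller.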
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d236df790156..9bb71597aa54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3404,29 +3404,34 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } -/* - * Retries tells us how many times we've called reserve_metadata_bytes. The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space. That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; + struct btrfs_trans_handle *trans; u64 unused; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; bool committed = false; bool flushing = false; + + trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3689,8 +3694,7 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { @@ -3699,7 +3703,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3708,8 +3712,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, return ret; } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, int min_factor, int flush) { @@ -3734,7 +3737,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4034,7 +4037,7 @@ int
btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5689,8 +5692,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5711,8 +5713,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, - 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { -- cgit v1.2.1 From 455757c322cc0a0f2a692c5625dd88aaf6a7b889 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Sep 2011 12:26:24 -0400 Subject: Btrfs: delay iput when deleting a block group I kept getting warnings from evict because we were calling btrfs_start_transaction() with a transaction already started when doing a balance. This is because we remove a block group, which requires a transaction, and then put the last reference on the cache inode. Instead of doing this we need to delay the iput so it is not done within a running transaction. This gets rid of our warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bb71597aa54..5498bdacd4c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7266,7 +7266,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for our lookup ref */ - iput(inode); + btrfs_add_delayed_iput(inode); } key.objectid = BTRFS_FREE_SPACE_OBJECTID; -- cgit v1.2.1 From ef3be45722317f8c2fb0e861065df0c3830ff9ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 22 Sep 2011 14:30:02 -0400 Subject: Btrfs: check unused against how much space we actually want There is a bug that may lead to early ENOSPC in our reservation code. We've been checking against num_bytes, which may be above and beyond what we want to actually reserve, which could give us a false ENOSPC. Fix this by making sure the unused space is above how much we want to reserve and not how much we're trying to flush.
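To put illustrative numbers on the bug: say orig_bytes is 4k but a couple of failed retries have inflated num_bytes to 12k to drive extra flushing. With 8k actually unused, the old check (8k >= 12k) fails and we return a false ENOSPC even though the 4k we really need fits; the new check (8k >= 4k) succeeds.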
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5498bdacd4c3..fd65f6bc676c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3472,7 +3472,7 @@ again: */ if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; - if (unused >= num_bytes) { + if (unused >= orig_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { -- cgit v1.2.1 From 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 17:12:22 -0400 Subject: Btrfs: allow us to overcommit our enospc reservations One of the things that kills us is the fact that our ENOSPC reservations are horribly over the top in most normal cases. There isn't too much that can be done about this because when we are completely full we really need them to work like this so we don't under-reserve. However if there are plenty of unallocated chunks on the disk we can use that to gauge how much we can overcommit. So this patch adds chunk free space accounting so we always know how much unallocated space we have. Then if we fail to make a reservation within our allocated space, check to see if we can overcommit. In the normal flushing case (like with delalloc metadata reservations) we'll take the free space and divide it by 2 if our metadata profile is set up for DUP or any of those, and then divide it by 8 to make sure we don't overcommit too much. Then if we're in a non-flushing case (we really need this reservation now!) we only limit ourselves to half of the free space. This makes this fio test [torrent] filename=torrent-test rw=randwrite size=4g ioengine=sync directory=/mnt/btrfs-test go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB file system. This doesn't seem to break my other enospc tests, but could really use some more testing as this is a super scary change. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 61 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fd65f6bc676c..25b69d0f9135 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation + * @check - whether this is just to check if we have enough space or not * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv.
If there is not enough space it will make an attempt to @@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, int flush, int check) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; @@ -3459,9 +3460,9 @@ again: } ret = -ENOSPC; - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group @@ -3470,9 +3471,8 @@ again: * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= orig_bytes) { + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { @@ -3489,10 +3489,43 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - num_bytes = unused - space_info->total_bytes + + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret && !check) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. 
+ */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + orig_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else @@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { -- cgit v1.2.1 From 73bc187680f94bed498f8a669103cad290e41180 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Oct 2011 14:07:49 -0400 Subject: Btrfs: introduce mount option no_space_cache Some users have requested this and I've found I needed a way to disable cache loading without actually clearing the cache, so introduce the no_space_cache option. Previously we checked the super block's cache generation field, and if it was populated we always turned space caching on. Now we check this and set the space cache option on, and then parse the mount options so that if we want it off it gets turned off. Then we check the mount option in all the places we do the caching work instead of checking the super's cache generation. This makes things more consistent and lets us turn space caching off. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 25b69d0f9135..f9711a82fc54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -481,7 +481,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * we likely hold important locks.
*/ if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root)) { + (root && root != root->fs_info->tree_root) && + btrfs_test_opt(root, SPACE_CACHE)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); @@ -4223,7 +4224,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_super_cache_generation(&info->super_copy) != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -7038,13 +7039,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = 1; cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); - if (cache_gen != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; - if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); while (1) { ret = find_first_block_group(root, path, &key); -- cgit v1.2.1 From 9a82ca659d8bfd99afc0e89bbde2202322df5755 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 16:35:28 -0400 Subject: Btrfs: take overflow into account in reserving space My overcommit stuff can be a little racy when we're filling up the disk with fs_mark and we overcommit into things that quickly get used up for data. So use num_bytes to see if we have enough available space so we're less likely to overcommit ourselves out of the ability to make reservations. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f9711a82fc54..f95e55083bdb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3521,7 +3521,7 @@ again: avail >>= 1; spin_unlock(&root->fs_info->free_chunk_lock); - if (used + orig_bytes < space_info->total_bytes + avail) { + if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; } -- cgit v1.2.1 From 5b0e95bf607ddd59b39f52d3d55e6581c817b530 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Oct 2011 08:58:24 -0400 Subject: Btrfs: inline checksums into the disk free space cache Yeah yeah I know this is how we used to do it and then I changed it, but damnit I'm changing it back. The fact is that writing out checksums will modify metadata, which could cause us to dirty a block group we've already written out, so we have to truncate it and all of its checksums and re-write it, which will write new checksums, which could dirty a block group that has already been written, and you see where I'm going with this? This can cause unmount or really anything that depends on a transaction to commit to take its sweet damned time to happen. So go back to the way it was, only this time we're specifically setting NODATACOW because we can't go through the COW pathway anyway and we're doing our own built-in cow'ing by truncating the free space cache. The other new thing is once we truncate the old cache and preallocate the new space, we don't need to do that song and dance at all for the rest of the transaction, we can just overwrite the existing space with the new cache if the block group changes for whatever reason, and the NODATACOW will let us do this fine.
So keep track of which transaction we last cleared our cache in and if we cleared it in this transaction just say we're all set up and carry on. This survives xfstests and stress.sh. The inode cache will continue to use the normal csum infrastructure since it only gets written once and there will be no more modifications to the fs tree in a transaction commit. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f95e55083bdb..0abf70c984e9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2717,6 +2717,13 @@ again: goto again; } + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -2756,19 +2763,16 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) { + if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); - } else { - btrfs_delalloc_release_space(inode, num_pages); - } + btrfs_free_reserved_data_space(inode, num_pages); out_put: iput(inode); @@ -2776,6 +2780,8 @@ out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); + if (!ret) + block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); -- cgit v1.2.1 From 4b91c14f913f649d4302b3677b85c4ce87a3d8e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 7 Oct 2011 11:55:34 -0400 Subject: Btrfs: wait for ordered extents if we didn't reclaim enough I noticed recently that my overcommit patch was causing one of my enospc tests to fail 25% of the time with early ENOSPC. This is because my overcommit patch was letting us go way overboard, but it wasn't waiting long enough to let the delalloc shrinker do its job. The problem is we just start writeback and wait a little bit hoping we flush enough, but we only free up delalloc space by having the writes complete all the way. We do this by waiting for ordered extents, which we do but only if we already freed enough for the reservation, which isn't right; we should flush ordered extents if we didn't reclaim enough in case that will push us over the edge. With this patch I've not seen a failure in this enospc test after running it in a loop for an hour.
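For illustration, here is the tail of shrink_delalloc() before and after this fix, condensed from the one-line change in the diff below (a sketch, not the full function):

    /* before: we only waited on ordered extents once we had already
     * reclaimed enough, which is backwards */
    if (reclaimed >= to_reclaim && !trans)
            btrfs_wait_ordered_extents(root, 0, 0);

    /* after: wait when we came up short, so that completing IO can
     * actually return the delalloc space we are counting on */
    if (reclaimed < to_reclaim && !trans)
            btrfs_wait_ordered_extents(root, 0, 0);
    return reclaimed >= to_reclaim;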
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0abf70c984e9..fc0de6880045 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3406,7 +3406,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed >= to_reclaim && !trans) + if (reclaimed < to_reclaim && !trans) btrfs_wait_ordered_extents(root, 0, 0); return reclaimed >= to_reclaim; } -- cgit v1.2.1 From bbb495c2ed9d34894cb3d6d366866fc92a2e1adc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:37:45 -0400 Subject: Btrfs: don't check bytes_pinned to determine if we should commit the transaction Before, the only reason to commit the transaction to recover space in reserve_metadata_bytes() was if there were enough pinned_bytes to satisfy our reservation. But now we have the delayed inode stuff, which will hold its reservations until we commit the transaction. So say we max out our reservation by creating a bunch of files but don't have any pinned bytes: we will ENOSPC out early even though we could commit the transaction and get that space back. So now just unconditionally commit the transaction, since currently there is no way to know how much metadata space is being reserved by the delayed inode stuff. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc0de6880045..79365a40cb3a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3568,17 +3568,6 @@ again: goto again; } - /* - * Not enough space to be reclaimed, don't bother committing the - * transaction. - */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < orig_bytes) - ret = -ENOSPC; - spin_unlock(&space_info->lock); - if (ret) - goto out; - ret = -EAGAIN; if (trans) goto out; -- cgit v1.2.1 From f104d044376aadcee74605d66b8d9dc2e145782c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:56:58 -0400 Subject: Btrfs: wait for ordered extents if we're in trouble when shrinking delalloc The only way we actually reclaim delalloc space is by waiting for the IO to completely finish. Usually we kick off a bunch of IO and wait for a little bit and hope we can make our reservation, and usually this works out pretty well. With overcommit, however, we can get seriously underwater if we're filling up the disk quickly, so we need to be able to force the delalloc shrinker to wait for the ordered IO to finish to give us a better chance of actually reclaiming enough space to get our reservation.
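The shape of the change inside the flush loop, sketched from the diff that follows (condensed; the surrounding loop and bookkeeping are omitted):

    if (wait_ordered && !trans) {
            /* seriously underwater: block until ordered IO finishes
             * and really frees its reservation */
            btrfs_wait_ordered_extents(root, 0, 0);
    } else {
            /* normal case: just give the flusher threads a tick */
            time_left = schedule_timeout_interruptible(1);
            if (time_left)
                    break;  /* we were interrupted, exit */
    }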
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 79365a40cb3a..96cbc5104959 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,7 +3334,8 @@ out: * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, int sync) + struct btrfs_root *root, u64 to_reclaim, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; @@ -3387,11 +3388,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (trans && trans->transaction->blocked) return -EAGAIN; - time_left = schedule_timeout_interruptible(1); + if (wait_ordered && !trans) { + btrfs_wait_ordered_extents(root, 0, 0); + } else { + time_left = schedule_timeout_interruptible(1); - /* We were interrupted, exit */ - if (time_left) - break; + /* We were interrupted, exit */ + if (time_left) + break; + } /* we've kicked the IO a few times, if anything has been freed, * exit. There is no sense in looping here for a long time @@ -3406,8 +3411,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed < to_reclaim && !trans) - btrfs_wait_ordered_extents(root, 0, 0); + return reclaimed >= to_reclaim; } @@ -3438,6 +3442,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root, int ret = 0; bool committed = false; bool flushing = false; + bool wait_ordered = false; trans = (struct btrfs_trans_handle *)current->journal_info; again: @@ -3496,6 +3501,7 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ + wait_ordered = true; num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } @@ -3530,6 +3536,8 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; + } else { + wait_ordered = true; } } @@ -3552,7 +3560,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, 1); + ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3564,6 +3572,7 @@ again: * so go back around and try again. */ if (retries < 2) { + wait_ordered = true; retries++; goto again; } -- cgit v1.2.1 From 877da174301dde9062b915da4c8103048be49702 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:02:10 -0400 Subject: Btrfs: allow shrink_delalloc to flush the needed reclaimed pages Currently we only allow a maximum of 2 megabytes of pages to be flushed at a time. This was ok before, but now we have overcommit, which will screw us in a heartbeat if we are quickly filling the disk. So instead pick either 2 megabytes or the number of pages we need to reclaim to be safe again, whichever is larger.
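As a concrete example, assuming 4 KiB pages (so PAGE_CACHE_SHIFT is 12): the old code always asked the flusher threads for 2 MiB, i.e. 512 pages, while with this change a 16 MiB reclaim target raises that to 4096 pages:

    unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; /* 512 */
    nr_pages = max_t(unsigned long, nr_pages,
                     max_reclaim >> PAGE_CACHE_SHIFT); /* 4096 for 16 MiB */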
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96cbc5104959..424ae82855c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3343,7 +3343,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; unsigned long progress; @@ -3366,7 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } max_reclaim = min(reserved, to_reclaim); - + nr_pages = max_t(unsigned long, nr_pages, + max_reclaim >> PAGE_CACHE_SHIFT); while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); -- cgit v1.2.1 From b24e03db0df3e9164c9649db12fecc8c2d81b0d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:40:17 -0400 Subject: Btrfs: release trans metadata bytes before flushing delayed refs We started setting trans->block_rsv = NULL to allow the delayed refs flushing stuff to use the right block_rsv and then just made btrfs_trans_release_metadata() unconditionally use the trans block rsv. The problem with this is we need to reserve some space in the transaction and then migrate it to the global block rsv, so we need to be able to free that properly. So instead just move btrfs_trans_release_metadata() before the delayed ref flushing and use trans->block_rsv for the freeing. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 424ae82855c8..eb4fe56b08bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3909,13 +3909,10 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (!trans->bytes_reserved) return; - block_rsv = &root->fs_info->trans_block_rsv; - btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } -- cgit v1.2.1 From 36ba022ac0b748dd543f43430b03198e899426c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 12:15:48 -0400 Subject: Btrfs: separate out btrfs_block_rsv_check into 2 different functions Currently btrfs_block_rsv_check does 2 things: it will either refill a block reserve like in the truncate or refill case, or it will check to see if there is enough space in the global reserve and possibly refill it. However because of overcommit we could be well overcommitting ourselves just to try and refill the global reserve, when really we should just be committing the transaction. So break this out into btrfs_block_rsv_refill and btrfs_block_rsv_check. Refill will try to reserve more metadata if it can and btrfs_block_rsv_check will not; it will only tell you if the factor of the total space is still reserved.
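Sketched, the division of labor between the two resulting helpers (signatures as in the diff below; the min_factor value here is only an example):

    /* check: purely observational, never reserves anything.  Returns 0
     * iff at least min_factor tenths of block_rsv->size is still backed
     * by real bytes. */
    ret = btrfs_block_rsv_check(root, block_rsv, 5);

    /* refill: tops the rsv back up to min_reserved bytes, calling into
     * reserve_metadata_bytes() (and therefore possibly flushing) when
     * it is short. */
    ret = btrfs_block_rsv_refill(root, block_rsv, min_reserved);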
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eb4fe56b08bb..a5f1421eeee9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3422,7 +3422,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - wether or not we can flush to make our reservation - * @check - wether this is just to check if we have enough space or not * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3433,7 +3432,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush, int check) + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; @@ -3507,7 +3506,7 @@ again: (orig_bytes * (retries + 1)); } - if (ret && !check) { + if (ret) { u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; @@ -3742,7 +3741,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3752,8 +3751,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, } int btrfs_block_rsv_check(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor, int flush) + struct btrfs_block_rsv *block_rsv, int min_factor) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3762,11 +3760,26 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return 0; spin_lock(&block_rsv->lock); - if (min_factor > 0) - num_bytes = div_factor(block_rsv->size, min_factor); - if (min_reserved > num_bytes) - num_bytes = min_reserved; + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; if (block_rsv->reserved >= num_bytes) ret = 0; else @@ -3776,7 +3789,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4073,7 +4086,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5728,7 +5741,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, 
block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5749,7 +5762,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { -- cgit v1.2.1 From 7e355b83efa80e5f5821591c13c17649594d82ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 13:07:31 -0400 Subject: Btrfs: if we have a lot of pinned space, commit the transaction Mitch kept hitting a panic because he was getting ENOSPC. One of my previous patches makes it so we are much better at not allocating new metadata chunks. Unfortunately, coupled with the overcommit patch, this works us into a bit of a problem if we are removing a bunch of space and end up chewing up all of our space with pinned extents. We can allocate chunks fine and overflow is ok, but the only way to reclaim this space is to commit the transaction. So if we go to overcommit, first check and see how much pinned space we have. If we have more than 80% of the free space chewed up with pinned extents, just commit the transaction; this will free up enough space for our reservation and we won't have this problem anymore. With this patch Mitch's test doesn't blow up anymore. Thanks, Reported-and-tested-by: Mitch Harder Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5f1421eeee9..4eb7d2ba38f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3510,6 +3510,20 @@ again: u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; + /* + * If we have a lot of space that's pinned, don't bother doing + * the overcommit dance yet and just commit the transaction. + */ + avail = (space_info->total_bytes - space_info->bytes_used) * 8; + do_div(avail, 10); + if (space_info->bytes_pinned >= avail && flush && !trans && + !committed) { + space_info->flush = 1; + flushing = true; + spin_unlock(&space_info->lock); + goto commit; + } + spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; @@ -3581,6 +3595,7 @@ again: if (trans) goto out; +commit: ret = -ENOSPC; if (committed) goto out; -- cgit v1.2.1 From 10b2f34d6e7fbe07f498cb2006272e9a561f5e60 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 2 Oct 2011 13:56:53 +0300 Subject: Btrfs: pass the correct root to lookup_free_space_inode() Free space items are located in the tree of tree roots, not in the extent tree. It didn't pop up because lookup_free_space_inode() grabs the inode all the time instead of actually searching the tree.
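A sketch of why the wrong root stayed latent (a simplified reconstruction of the era's free-space-cache.c behavior, not the literal code):

    /* lookup_free_space_inode() usually returns the inode cached on the
     * block group and never touches its root argument; only the first
     * lookup actually searches the tree, and FREE_SPACE items live in
     * the tree of tree roots, hence tree_root below: */
    inode = lookup_free_space_inode(tree_root, block_group, path);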
Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4eb7d2ba38f8..6cfcc9060c83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7312,7 +7312,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); BUG_ON(ret); -- cgit v1.2.1 From 60d2adbb1e7fee1cb4bc67f70bd0bd8ace7b6c3c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 9 Sep 2011 17:34:35 +0800 Subject: Btrfs: fix race between multi-task space allocation and caching space A task may fail to get free space even though enough is available, when multi-task space allocation and space caching happen at the same time. Task1 Caching Thread Task2 ------------------------------------------------------------------------ find_free_extent The space has not been cached, and start caching thread. And wait for it. cache space, if the space is > 2MB wake up Task1 find_free_extent get all the space that is cached. try to allocate space, but there is no space now. trigger BUG_ON() The message is the following: btrfs allocation failed flags 1, wanted 4096 space_info has 1040187392 free, is not full space_info total=1082130432, used=4096, pinned=41938944, reserved=0, may_use=40828928, readonly=0 block group 12582912 has 8388608 bytes, 0 used 8388608 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is block group 1103101952 has 1073741824 bytes, 4096 used 33550336 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is ------------[ cut here ]------------ kernel BUG at fs/btrfs/inode.c:835! [] __extent_writepage+0x1bf/0x5ce [btrfs] [] ? __set_page_dirty_nobuffers+0xfe/0x108 [] ? wait_current_trans+0x23/0xec [btrfs] [] ? find_get_pages_tag+0x73/0xe2 [] extent_write_cache_pages.clone.0+0x176/0x29a [btrfs] [] extent_writepages+0x3e/0x53 [btrfs] [] ? do_sync_write+0xc6/0x103 [] ? btrfs_submit_direct+0x414/0x414 [btrfs] [] ? fsnotify+0x236/0x266 [] btrfs_writepages+0x22/0x24 [btrfs] [] do_writepages+0x1c/0x25 [] __filemap_fdatawrite_range+0x4e/0x50 [] filemap_write_and_wait_range+0x28/0x51 [] btrfs_sync_file+0x7d/0x198 [btrfs] [] ? fsnotify_modify+0x5d/0x65 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b [SNIP] RIP [] cow_file_range+0x1c4/0x32b [btrfs] We fix this bug by trying to allocate the space again if there are block groups still caching.
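Condensed from the diff below, the retry logic this adds to find_free_extent() (a heavily elided sketch of a much larger function):

    search:
            have_caching_bg = false;
            down_read(&space_info->groups_sem);
            /* ... walk the block groups; when a group yields no offset
             * and was not fully cached, remember that fact ... */
            if (!offset) {
                    if (!cached)
                            have_caching_bg = true;
                    goto loop;
            }
            /* ... after the walk, go around again rather than BUG_ON(),
             * since a caching thread may have exposed more space ... */
            if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
                    goto search;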
Signed-off-by: Miao Xie --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6cfcc9060c83..cef355f1328a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4954,6 +4954,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -5036,6 +5037,7 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -5244,6 +5246,8 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: @@ -5294,6 +5298,9 @@ loop: } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; -- cgit v1.2.1 From dff51cd1c60856c28f5d22a571294c2b70b6b322 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 14 Jun 2011 12:52:17 +0200 Subject: btrfs: ratelimit WARN_ON in use_block_rsv Under some circumstances the WARN_ON heavily pollutes the log and slows down the machine. This is just a safety; the warning should be fixed by another patch. Nevertheless, it still pops up during testing. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cef355f1328a..28c4809851a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -5783,7 +5784,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret) { - WARN_ON(1); + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + /*DEFAULT_RATELIMIT_BURST*/ 2); + if (__ratelimit(&_rs)) { + printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); + WARN_ON(1); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; -- cgit v1.2.1 From e688b7252f784c2479d559f9f70ca8354752c5e7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 31 Oct 2011 20:52:39 -0400 Subject: Btrfs: fix extent pinning bugs in the tree log The tree log had two important bugs that could cause corruptions after a crash. Sometimes we were allowing tree log blocks to be reused after the tree log was committed but before the transaction commit was done. This allowed a future metadata write to overwrite the tree log data. It is fixed by adding a new variant of freeing reserved extents that always pins them. Credit goes to Stefan Behrens and Arne Jansen for many, many hours spent tracking this bug down. During tree log replay, we do a pass through the tree log and pin all the extents we find. This makes sure the replay code won't go in and use any of those blocks for new allocations during replay. The problem is the free space cache isn't honoring these pinned extents. So the allocator can end up handing them out, leading to all kinds of problems during replay.
The fix here is to force any free space cache to load while we pin the extents, and then to make sure we remove the pinned extents from the free space rbtree. Signed-off-by: Chris Mason Reported-by: Stefan Behrens --- fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 28c4809851a5..cb7626646bba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4344,6 +4344,34 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } +/* + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. + */ + cache_block_group(cache, trans, root, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return 0; +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -5487,7 +5515,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int pin) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -5502,8 +5531,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + } btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5511,6 +5544,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, -- cgit v1.2.1 From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Apr 2011 15:41:04 +0200 Subject: btrfs: separate superblock items out of fs_info fs_info is now ~9kb, more than fits into one page. This will cause mount failure when memory is too fragmented. Top space consumers are super block structures super_copy and super_for_commit, ~2.8kb each. Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64) Add a wrapper for freeing fs_info and all of its dynamically allocated members.
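The gist of the layout change, sketched (field names from the commit; the teardown body is simplified, and the real wrapper frees more members than shown):

    struct btrfs_fs_info {
            /* ... */
            struct btrfs_super_block *super_copy;       /* was embedded */
            struct btrfs_super_block *super_for_commit; /* was embedded */
            /* ... */
    };

    static inline void free_fs_info(struct btrfs_fs_info *fs_info)
    {
            kfree(fs_info->super_copy);
            kfree(fs_info->super_for_commit);
            /* ... other dynamically allocated members ... */
            kfree(fs_info);
    }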
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cb7626646bba..782eb3ea8edf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3209,7 +3209,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3231,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3843,7 +3843,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -4222,12 +4222,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -7127,9 +7127,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; @@ -7458,7 +7458,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; -- cgit v1.2.1 From 6d668dda0caec537fbf28c4d91e6d18181af3cff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: make a delayed_block_rsv for the delayed item insertion I've been hitting warnings in use_block_rsv when running the delayed insertion stuff. It's because we will readjust the global block rsv based on what is in use, which means we could end up discarding reservations that are for the delayed insertion stuff. So instead create a separate block rsv for the delayed insertion stuff. This will also make it easier to debug problems with the delayed insertion reservations since we will know that only the delayed insertion code touches this block_rsv.
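Sketched, the wiring this adds (the init and release hunks are in the diff above; the struct field and the delayed-inode call site shown here are assumptions based on the commit description):

    struct btrfs_fs_info {
            /* ... */
            /* only the delayed item code touches this one */
            struct btrfs_block_rsv delayed_block_rsv;
    };

    /* delayed-inode.c then reserves against its own rsv, so readjusting
     * the global rsv can no longer discard these reservations: */
    rsv = &root->fs_info->delayed_block_rsv;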
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 782eb3ea8edf..01c1f08b976a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3914,6 +3914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3933,6 +3934,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, -- cgit v1.2.1 From 663350ac38c67ca388acea6e876dc6d668c232b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: be smarter about committing the transaction in reserve_metadata_bytes Because of the overcommit stuff I had to make it so that we committed the transaction all the time in reserve_metadata_bytes in case we had overcommitted because of delayed items. This was because previously we had no way of knowing how much space was reserved for delayed items. Now that we have the delayed_block_rsv we can check it to see if committing the transaction would get us anywhere. This patch breaks out the committing logic into a helper function that will check to see if committing the transaction would free enough space for us to get anything done. With this patch xfstests 83 goes from taking 445 seconds to taking 28 seconds on my box. 
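Condensed, the helper's decision logic (the full version, with locking, is in the diff below):

    /* commit only if doing so can plausibly return enough space,
     * either from pinned extents or from delayed insertion rsvs */
    if (space_info->bytes_pinned >= bytes)
            goto commit;
    if (space_info != delayed_rsv->space_info ||
        delayed_rsv->size < bytes)
            return -ENOSPC;   /* a commit would not help */
    goto commit;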
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 86 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 01c1f08b976a..5b84205e7685 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,12 +3334,12 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; @@ -3348,6 +3348,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, int loops = 0; unsigned long progress; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; @@ -3417,6 +3418,60 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + spin_lock(&space_info->lock); + if (space_info->bytes_pinned >= bytes) { + spin_unlock(&space_info->lock); + goto commit; + } + spin_unlock(&space_info->lock); + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size < bytes) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -3436,7 +3491,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - struct btrfs_trans_handle *trans; u64 used; u64 num_bytes = orig_bytes; int retries = 0; @@ -3445,7 +3499,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, bool flushing = false; bool wait_ordered = false; - trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3461,7 +3514,7 @@ again: * deadlock since we are waiting for the flusher to finish, but * hold the current transaction open. 
*/ - if (trans) + if (current->journal_info) return -EAGAIN; ret = wait_event_interruptible(space_info->wait, !space_info->flush); @@ -3517,12 +3570,16 @@ again: */ avail = (space_info->total_bytes - space_info->bytes_used) * 8; do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !trans && - !committed) { + if (space_info->bytes_pinned >= avail && flush && !committed) { space_info->flush = 1; flushing = true; spin_unlock(&space_info->lock); - goto commit; + ret = may_commit_transaction(root, space_info, + orig_bytes, 1); + if (ret) + goto out; + committed = true; + goto again; } spin_lock(&root->fs_info->free_chunk_lock); @@ -3575,7 +3632,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); + ret = shrink_delalloc(root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3592,21 +3649,12 @@ again: goto again; } - ret = -EAGAIN; - if (trans) - goto out; - -commit: ret = -ENOSPC; if (committed) goto out; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto out; - ret = btrfs_commit_transaction(trans, root); + ret = may_commit_transaction(root, space_info, orig_bytes, 0); if (!ret) { - trans = NULL; committed = true; goto again; } -- cgit v1.2.1 From c06a0e120a4e381a1c291c1fce3c6155c5791cae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Nov 2011 19:56:02 -0400 Subject: Btrfs: fix delayed insertion reservation We all keep getting those stupid warnings from use_block_rsv when running stress.sh, and it's because the delayed insertion stuff is being stupid. It's not the delayed insertion stuff's fault, it's all just stupid. When marking an inode dirty for, oh say, updating the time on it, we just do a btrfs_join_transaction, which doesn't reserve any space. This is stupid because we're going to have to have space reserved to make this change, but we do it because it's fast because chances are we're going to call it over and over again and it doesn't matter. Well, thanks to the delayed insertion stuff this is mostly the case, so we do actually need to make this reservation. So if trans->bytes_reserved is 0 then try to do a normal reservation. If not, return ENOSPC, which will make btrfs_dirty_inode start a proper transaction, which will let it do the whole ENOSPC dance and reserve enough space for the delayed insertion to steal the reservation from the transaction. The other stupid thing we do is not reserve space for the inode when writing to the thing. Usually this is ok since we have to update the time so we'd have already done all this work before we get to the endio stuff, so it doesn't matter. But this is stupid because we could write the data after the transaction commits where we changed the mtime of the inode, so we have to cow all the way down to the inode anyway. This used to be masked by the delalloc reservation stuff, but because we delay the update it doesn't get masked in this case. So again the delayed insertion stuff bites us in the ass. So if our trans->block_rsv is delalloc, just steal the reservation from the delalloc reserve. Hopefully this won't bite us in the ass, but I've said that before. With this patch stress.sh no longer spits out those stupid warnings (famous last words).
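A condensed sketch of the two behaviors described above; the extent-tree.c helper is in the diff below, while the delayed-inode.c call site here is an assumption based on the commit description, not the verbatim code:

    if (!trans->bytes_reserved &&
        src_rsv != &root->fs_info->delalloc_block_rsv) {
            /* joined transaction with no reservation: reserve without
             * flushing, else push the caller into a real transaction
             * by returning ENOSPC */
            ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
    } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
            /* data write path: steal the bytes from the delalloc rsv */
            ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
    }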
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b84205e7685..23e936c3de76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3814,6 +3814,24 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; +} + int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { -- cgit v1.2.1
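For reference, the only difference between the new helper and btrfs_block_rsv_add() is the flush argument handed down to reserve_metadata_bytes(); a caller that must not recurse into flushing or committing uses the _noflush variant and handles -ENOSPC itself:

    /* may flush delalloc or commit the transaction to satisfy us */
    ret = btrfs_block_rsv_add(root, rsv, num_bytes);

    /* identical, except reserve_metadata_bytes() gets flush == 0:
     * fail fast with -ENOSPC instead of flushing */
    ret = btrfs_block_rsv_add_noflush(root, rsv, num_bytes);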