From 890871be854b5f5e43e7ba2475f706209906cc24 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 16:24:52 -0400 Subject: Btrfs: switch extent_map to a rw lock There are two main users of the extent_map tree. The first is regular file inodes, where it is evenly spread between readers and writers. The second is the chunk allocation tree, which maps blocks from logical addresses to phyiscal ones, and it is 99.99% reads. The mapping tree is a point of lock contention during heavy IO workloads, so this commit switches things to a rw lock. Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/relocation.c') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c04f7f212602..4adab903fc2b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2646,9 +2646,9 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); while (1) { int ret; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; -- cgit v1.2.1 From 9655d2982b53fdb38a9e0f2f11315b99b92d66e2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 15:22:30 -0400 Subject: Btrfs: use a cached state for extent state operations during delalloc This changes the btrfs code to find delalloc ranges in the extent state tree to use the new state caching code from set/test bit. It reduces one of the biggest causes of rbtree searches in the writeback path. test_range_bit is also modified to take the cached state as a starting point while searching. Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/relocation.c') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4adab903fc2b..3be16ccc7eea 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2180,7 +2180,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, struct reloc_control *rc) { if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1)) + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; return 0; } -- cgit v1.2.1 From 76dda93c6ae2c1dc3e6cde34569d6aca26b0c918 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 21 Sep 2009 16:00:26 -0400 Subject: Btrfs: add snapshot/subvolume destroy ioctl This patch adds snapshot/subvolume destroy ioctl. A subvolume that isn't being used and doesn't contains links to other subvolumes can be destroyed. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/relocation.c') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3be16ccc7eea..48a504260635 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3203,6 +3203,7 @@ static int check_extent_flags(u64 flags) return 0; } + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; @@ -3220,6 +3221,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) if (!path) return -ENOMEM; + rc->extents_found = 0; + rc->extents_skipped = 0; + rc->search_start = rc->block_group->key.objectid; clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, GFP_NOFS); @@ -3475,14 +3479,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { - mutex_lock(&fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(fs_info->tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - rc->extents_found = 0; rc->extents_skipped = 0; + mutex_lock(&fs_info->cleaner_mutex); + + btrfs_clean_old_snapshots(fs_info->tree_root); ret = relocate_block_group(rc); + + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; break; @@ -3530,6 +3535,26 @@ out: return err; } +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + root->root_item.drop_level = 0; + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + + ret = btrfs_end_transaction(trans, root->fs_info->tree_root); + BUG_ON(ret); + return 0; +} + /* * recover relocation interrupted by system crash. * @@ -3589,8 +3614,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(root->fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); - goto out; + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + mark_garbage_root(reloc_root); } } -- cgit v1.2.1 From 0257bb82d21bedff26541bcf12f1461c23f9ed61 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 24 Sep 2009 09:17:31 -0400 Subject: Btrfs: relocate file extents in clusters The extent relocation code copy file extents one by one when relocating data block group. This is inefficient if file extents are small. This patch makes the relocation code copy file extents in clusters. So we can can make better use of read-ahead. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 237 +++++++++++++++++++++++++++++++------------------- 1 file changed, 148 insertions(+), 89 deletions(-) (limited to 'fs/btrfs/relocation.c') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 48a504260635..361ad323faac 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -121,6 +121,15 @@ struct inodevec { int nr; }; +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + struct reloc_control { /* block group to relocate */ struct btrfs_block_group_cache *block_group; @@ -2529,56 +2538,94 @@ out: } static noinline_for_stack -int relocate_inode_pages(struct inode *inode, u64 start, u64 len) +int setup_extent_mapping(struct inode *inode, u64 start, u64 end, + u64 block_start) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret = 0; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) { u64 page_start; u64 page_end; - unsigned long i; - unsigned long first_index; + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; unsigned long last_index; - unsigned int total_read = 0; - unsigned int total_dirty = 0; + unsigned int dirty_page = 0; struct page *page; struct file_ra_state *ra; - struct btrfs_ordered_extent *ordered; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int nr = 0; int ret = 0; + if (!cluster->nr) + return 0; + ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) return -ENOMEM; + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + mutex_lock(&inode->i_mutex); - first_index = start >> PAGE_CACHE_SHIFT; - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; - /* make sure the dirty trick played by the caller work */ - while (1) { - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); - if (ret != -EBUSY) - break; - schedule_timeout(HZ/10); - } + i_size_write(inode, cluster->end + 1 - offset); + ret = setup_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); if (ret) goto out_unlock; file_ra_state_init(ra, inode->i_mapping); - for (i = first_index ; i <= last_index; i++) { - if (total_read % ra->ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, ra, NULL, i, - min(last_index, ra->ra_pages + i - 1)); - } - total_read++; -again: - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) - BUG_ON(1); - page = grab_cache_page(inode->i_mapping, i); + WARN_ON(cluster->start != cluster->boundary[0]); + while (index <= last_index) { + page = find_lock_page(inode->i_mapping, index); if (!page) { - ret = -ENOMEM; - goto out_unlock; + page_cache_sync_readahead(inode->i_mapping, + ra, NULL, index, + last_index + 1 - index); + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, + ra, NULL, page, index, + last_index + 1 - index); } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -2589,75 +2636,79 @@ again: goto out_unlock; } } - wait_on_page_writeback(page); page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } + + lock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); - if (i == first_index) - set_extent_bits(io_tree, page_start, page_end, + if (nr < cluster->nr && + page_start + offset == cluster->boundary[nr]) { + set_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, EXTENT_BOUNDARY, GFP_NOFS); + nr++; + } btrfs_set_extent_delalloc(inode, page_start, page_end); set_page_dirty(page); - total_dirty++; + dirty_page++; - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); + + index++; + if (nr < cluster->nr && + page_end + 1 + offset == cluster->boundary[nr]) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); + dirty_page = 0; + } + } + if (dirty_page) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); } + WARN_ON(nr != cluster->nr); out_unlock: mutex_unlock(&inode->i_mutex); kfree(ra); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); return ret; } static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; - u64 end = start + extent_key->offset - 1; - - em = alloc_extent_map(GFP_NOFS); - em->start = start; - em->len = extent_key->offset; - em->block_len = extent_key->offset; - em->block_start = extent_key->objectid; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + int ret; - /* setup extent map to cheat btrfs_readpage */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - while (1) { - int ret; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, end, 0); + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - return relocate_inode_pages(inode, start, extent_key->offset); + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; } #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 @@ -3208,6 +3259,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; struct btrfs_key key; + struct file_extent_cluster *cluster; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -3217,6 +3269,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) int ret; int err = 0; + cluster = kzalloc(sizeof(*cluster), GFP_NOFS); + if (!cluster) + return -ENOMEM; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3310,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_end_transaction(trans, rc->extent_root); trans = NULL; btrfs_btree_balance_dirty(rc->extent_root, nr); if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = 1; - ret = relocate_data_extent(rc->data_inode, &key); + ret = relocate_data_extent(rc->data_inode, + &key, cluster); if (ret < 0) { err = ret; break; @@ -3332,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) btrfs_btree_balance_dirty(rc->extent_root, nr); } + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, cluster); + if (ret < 0) + err = ret; + } + + kfree(cluster); + rc->create_reloc_root = 0; smp_mb(); @@ -3352,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) + struct btrfs_root *root, u64 objectid) { struct btrfs_path *path; struct btrfs_inode_item *item; @@ -3372,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); btrfs_mark_buffer_dirty(leaf); @@ -3408,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, if (err) goto out; - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); + err = __insert_orphan_inode(trans, root, objectid); BUG_ON(err); key.objectid = objectid; @@ -3519,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } } - filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); + filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); -- cgit v1.2.1