diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/check-integrity.c | 163 | ||||
-rw-r--r-- | fs/btrfs/compression.c | 18 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 2 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 78 | ||||
-rw-r--r-- | fs/btrfs/dev-replace.c | 23 | ||||
-rw-r--r-- | fs/btrfs/dir-item.c | 10 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 49 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 211 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 41 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 1 | ||||
-rw-r--r-- | fs/btrfs/extent_map.c | 2 | ||||
-rw-r--r-- | fs/btrfs/file.c | 51 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 117 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.h | 2 | ||||
-rw-r--r-- | fs/btrfs/inode-map.c | 4 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 152 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 36 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.c | 49 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.h | 12 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 90 | ||||
-rw-r--r-- | fs/btrfs/send.c | 49 | ||||
-rw-r--r-- | fs/btrfs/super.c | 94 | ||||
-rw-r--r-- | fs/btrfs/sysfs.c | 34 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 166 | ||||
-rw-r--r-- | fs/btrfs/transaction.h | 6 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 50 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 38 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 18 | ||||
-rw-r--r-- | fs/btrfs/xattr.c | 150 |
29 files changed, 1208 insertions, 508 deletions
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index cb7f3fe9c9f6..d897ef803b3b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -94,6 +94,7 @@ #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "disk-io.h" #include "hash.h" @@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state, static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfsic_block_data_ctx *block_ctx_out, int mirror_num); -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out); static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); @@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block( l = NULL; next_block->generation = BTRFSIC_GENERATION_UNKNOWN; } else { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch (!= stored %llu).\n", - next_bytenr, next_block_ctx->dev->name, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, next_block), - next_block->logical_bytenr); - } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - next_bytenr, next_block_ctx->dev->name, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, next_block)); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block), + next_block->logical_bytenr); + else + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block)); + } next_block->logical_bytenr = next_bytenr; next_block->mirror_num = *mirror_nump; @@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data( return -1; } if (!block_was_created) { - if (next_block->logical_bytenr != next_bytenr && + if ((state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) && + next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) { printk(KERN_INFO @@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, return ret; } -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out) -{ - block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); - block_ctx_out->dev_bytenr = bytenr; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - if (NULL != block_ctx_out->dev) { - return 0; - } else { - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); - return -ENXIO; - } -} - static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) { if (block_ctx->mem_to_free) { @@ -1901,25 +1883,26 @@ again: dev_state, dev_bytenr); } - if (block->logical_bytenr != bytenr && - !(!block->is_metadata && - block->logical_bytenr == 0)) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch" - " (!= stored %llu).\n", - bytenr, dev_state->name, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block), - block->logical_bytenr); - else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - bytenr, dev_state->name, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + bytenr, dev_state->name, + dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, + block), + block->logical_bytenr); + else + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", + bytenr, dev_state->name, + dev_bytenr, block->mirror_num, + btrfsic_get_block_type(state, + block)); + } block->logical_bytenr = bytenr; } else { if (num_pages * PAGE_CACHE_SIZE < @@ -2002,24 +1985,13 @@ again: } } - if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, - processed_len, - bdev, &block_ctx); - else - ret = btrfsic_map_block(state, bytenr, processed_len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", bytenr); - goto continue_loop; - } - block_ctx.datav = mapped_datav; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ block_ctx.dev = dev_state; block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; if (is_metadata || state->include_extent_data) { block->never_written = 0; @@ -2133,10 +2105,6 @@ again: /* this is getting ugly for the * include_extent_data case... */ bytenr = 0; /* unknown */ - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.mem_to_free = NULL; - block_ctx.pagev = NULL; } else { processed_len = state->metablock_size; bytenr = btrfs_stack_header_bytenr( @@ -2149,22 +2117,15 @@ again: "Written block @%llu (%s/%llu/?)" " !found in hash table, M.\n", bytenr, dev_state->name, dev_bytenr); - - ret = btrfsic_map_block(state, bytenr, processed_len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", - dev_bytenr); - goto continue_loop; - } } - block_ctx.datav = mapped_datav; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ + block_ctx.dev = dev_state; block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; block = btrfsic_block_alloc(); if (NULL == block) { @@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root, root->sectorsize, PAGE_CACHE_SIZE); return -1; } - state = kzalloc(sizeof(*state), GFP_NOFS); - if (NULL == state) { - printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); - return -1; + state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!state) { + state = vzalloc(sizeof(*state)); + if (!state) { + printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n"); + return -1; + } } if (!btrfsic_is_initialized) { @@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root, mutex_unlock(&btrfsic_mutex); - kfree(state); + if (is_vmalloc_addr(state)) + vfree(state); + else + kfree(state); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d3220d31d3cb..1bf411bc28fd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -224,16 +224,19 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline void end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, + const struct compressed_bio *cb) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long index = cb->start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; int i; int ret; + if (cb->errors) + mapping_set_error(inode->i_mapping, -EIO); + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, min_t(unsigned long, @@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start, continue; } for (i = 0; i < ret; i++) { + if (cb->errors) + SetPageError(pages[i]); end_page_writeback(pages[i]); page_cache_release(pages[i]); } @@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err) tree->ops->writepage_end_io_hook(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, - NULL, 1); + NULL, + err ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; - end_compressed_writeback(inode, cb->start, cb->len); + end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ /* diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 150822ee0a0b..14a72ed14ef7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2929,7 +2929,7 @@ done: */ if (!p->leave_spinning) btrfs_set_path_blocking(p); - if (ret < 0) + if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 470e3177a7e8..e6fbbd74b716 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -607,6 +607,7 @@ struct btrfs_path { unsigned int leave_spinning:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; + unsigned int skip_release_on_error:1; }; /* @@ -1170,6 +1171,7 @@ struct btrfs_space_info { struct percpu_counter total_bytes_pinned; struct list_head list; + struct list_head ro_bgs; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache { unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; + unsigned int has_caching_ctl:1; + unsigned int removed:1; int disk_cache_state; @@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache { /* For delayed block group creation or deletion of empty block groups */ struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + atomic_t trimming; }; /* delayed seq elem */ @@ -1402,6 +1411,11 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; unsigned long mount_opt; + /* + * Track requests for actions that need to be done during transaction + * commit (like for some mount options). + */ + unsigned long pending_changes; unsigned long compress_type:4; int commit_interval; /* @@ -1729,6 +1743,12 @@ struct btrfs_fs_info { /* For btrfs to record security options */ struct security_mnt_opts security_opts; + + /* + * Chunks that can't be freed yet (under a trim/discard operation) + * and will be latter freed. Protected by fs_info->chunk_mutex. + */ + struct list_head pinned_chunks; }; struct btrfs_subvolume_writers { @@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) -#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) + #define btrfs_set_and_info(root, opt, fmt, args...) \ { \ if (!btrfs_test_opt(root, opt)) \ @@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args { } /* + * Requests for changes that need to be done during transaction commit. + * + * Internal mount options that are used for special handling of the real + * mount options (eg. cannot be set during remount and have to be set during + * transaction commit) + */ + +#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) +#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) +#define BTRFS_PENDING_COMMIT (2) + +#define btrfs_test_pending(info, opt) \ + test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_set_pending(info, opt) \ + set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_clear_pending(info, opt) \ + clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) + +/* + * Helpers for setting pending mount option changes. + * + * Expects corresponding macros + * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name + */ +#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), SET_##opt); \ + btrfs_clear_pending((info), CLEAR_##opt); \ + } \ +} while(0) + +#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), CLEAR_##opt); \ + btrfs_clear_pending((info), SET_##opt); \ + } \ +} while(0) + +/* * Inode flags */ #define BTRFS_INODE_NODATASUM (1 << 0) @@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start); + struct btrfs_root *root, u64 group_start, + struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); -int btrfs_start_nocow_write(struct btrfs_root *root); -void btrfs_end_nocow_write(struct btrfs_root *root); +int btrfs_start_write_no_snapshoting(struct btrfs_root *root); +void btrfs_end_write_no_snapshoting(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_dir_item *dir_item); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, + int name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0bf41f8b1e23..ca6a3a3b6b6c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -417,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); - WARN_ON(ret); + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == -EINPROGRESS) { + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + ret = 0; + } else { + WARN_ON(ret); + } - return 0; + return ret; leave: dev_replace->srcdev = NULL; @@ -537,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; + return scrub_ret; } printk_in_rcu(KERN_INFO @@ -566,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); - btrfs_dev_replace_unlock(dev_replace); btrfs_rm_dev_replace_blocked(fs_info); - btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device); btrfs_rm_dev_replace_unblocked(fs_info); @@ -589,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); + /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); if (!IS_ERR(trans)) diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index fc8df866e919..1752625fb4dd 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,10 +21,6 @@ #include "hash.h" #include "transaction.h" -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len); - /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On @@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len) +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) { struct btrfs_dir_item *dir_item; unsigned long name_ptr; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1bf9f897065d..30965120772b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + INIT_LIST_HEAD(&fs_info->pinned_chunks); + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2830,9 +2832,11 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } - /* Set the real inode map cache flag */ - if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) - btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + /* + * Mount does not set all options immediatelly, we can do it now and do + * not have to wait for transaction commit + */ + btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { @@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; + + lock_chunks(root); + while (!list_empty(&fs_info->pinned_chunks)) { + struct extent_map *em; + + em = list_first_entry(&fs_info->pinned_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } + unlock_chunks(root); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, */ if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->root); + btrfs_super_root(sb)); if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->chunk_root); + printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", + btrfs_super_chunk_root(sb)); if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", btrfs_super_log_root(sb)); if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { @@ -4129,6 +4144,25 @@ again: return 0; } +static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); + btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47c1ba141082..222d6aea4a8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache) struct btrfs_caching_control *ctl; spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_STARTED) { - spin_unlock(&cache->lock); - return NULL; - } - - /* We're loading it the fast way, so we don't have a caching_ctl. */ if (!cache->caching_ctl) { spin_unlock(&cache->lock); return NULL; @@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, spin_unlock(&cache->lock); if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); @@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; } else { if (load_cache_only) { cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } } spin_unlock(&cache->lock); + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); if (ret == 1) { put_caching_control(caching_ctl); @@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } spin_unlock(&cache->lock); wake_up(&caching_ctl->wait); @@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->key.objectid + cache->key.offset; + + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(root->fs_info, + next_bytenr); + return cache; + } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { @@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + INIT_LIST_HEAD(&found->ro_bgs); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); /* * No longer have used bytes in this block group, queue * it for deletion. @@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root, } spin_unlock(&info->unused_bgs_lock); } - btrfs_set_block_group_used(&cache->item, old_val); - cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - set_extent_dirty(info->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro = 1; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } out: @@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, /* * helper to account the unused space of all the readonly block group in the - * list. takes mirrors into account. + * space_info. takes mirrors into account. */ -static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { struct btrfs_block_group_cache *block_group; u64 free_bytes = 0; int factor; - list_for_each_entry(block_group, groups_list, list) { + /* It's df, we don't care if it's racey */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { spin_lock(&block_group->lock); if (!block_group->ro) { @@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) spin_unlock(&block_group->lock); } - - return free_bytes; -} - -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. - */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - int i; - u64 free_bytes = 0; - - spin_lock(&sinfo->lock); - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - if (!list_empty(&sinfo->block_groups[i])) - free_bytes += __btrfs_get_ro_block_group_free_space( - &sinfo->block_groups[i]); - spin_unlock(&sinfo->lock); return free_bytes; @@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root, cache->bytes_super - btrfs_block_group_used(&cache->item); sinfo->bytes_readonly -= num_bytes; cache->ro = 0; + list_del_init(&cache->ro_list); spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); return cache; } @@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, int ret = 0; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { - list_del_init(&block_group->bg_list); if (ret) - continue; + goto next; spin_lock(&block_group->lock); memcpy(&item, &block_group->item, sizeof(item)); @@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); +next: + list_del_init(&block_group->bg_list); } } @@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) + struct btrfs_root *root, u64 group_start, + struct extent_map *em) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int ret; int index; int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; root = root->fs_info->extent_root; @@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); if (root->fs_info->first_logical_byte == block_group->key.objectid) root->fs_info->first_logical_byte = (u64)-1; @@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); + list_del_init(&block_group->ro_list); if (list_empty(&block_group->space_info->block_groups[index])) { kobj = block_group->space_info->block_group_kobjs[index]; block_group->space_info->block_group_kobjs[index] = NULL; @@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, kobject_put(kobj); } + if (block_group->has_caching_ctl) + caching_ctl = get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&root->fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &root->fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + atomic_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&root->fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + put_caching_control(caching_ctl); + put_caching_control(caching_ctl); + } + } btrfs_remove_free_space_cache(block_group); @@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + lock_chunks(root); + if (!list_empty(&em->list)) { + /* We're in the transaction->pending_chunks list. */ + free_extent_map(em); + } + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). + */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. + */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + /* + * The em might be in the pending_chunks list, so make sure the + * chunk mutex is locked, since remove_extent_mapping() will + * delete us from that list. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + + unlock_chunks(root); + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group->key.objectid; end = start + block_group->key.offset - 1; - clear_extent_bits(&fs_info->freed_extents[0], start, end, + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); - clear_extent_bits(&fs_info->freed_extents[1], start, end, + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group->pinned = 0; @@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ ret = btrfs_remove_chunk(trans, root, block_group->key.objectid); +end_trans: btrfs_end_transaction(trans, root); next: btrfs_put_block_group(block_group); @@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } /* - * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), - * they are used to prevent the some tasks writing data into the page cache - * by nocow before the subvolume is snapshoted, but flush the data into - * the disk after the snapshot creation. + * btrfs_{start,end}_write_no_snapshoting() are similar to + * mnt_{want,drop}_write(), they are used to prevent some tasks from writing + * data into the page cache through nocow before the subvolume is snapshoted, + * but flush the data into disk after the snapshot creation, or to prevent + * operations while snapshoting is ongoing and that cause the snapshot to be + * inconsistent (writes followed by expanding truncates for example). */ -void btrfs_end_nocow_write(struct btrfs_root *root) +void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* @@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) wake_up(&root->subv_writers->wait); } -int btrfs_start_nocow_write(struct btrfs_root *root) +int btrfs_start_write_no_snapshoting(struct btrfs_root *root) { if (atomic_read(&root->will_be_snapshoted)) return 0; @@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) */ smp_mb(); if (atomic_read(&root->will_be_snapshoted)) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); return 0; } return 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf3f424e0013..4ebabd237153 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, clear = 1; again: if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; } spin_lock(&tree->lock); @@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree, state->state |= bits_to_set; } -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + const u64 flags) { if (cached_ptr && !(*cached_ptr)) { - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + if (!flags || (state->state & flags)) { *cached_ptr = state; atomic_inc(&state->refs); } } } +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_IOBITS | EXTENT_BOUNDARY); +} + /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. @@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err = 0; u64 last_start; u64 last_end; + bool first_iteration = true; btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ prealloc = alloc_extent_state(mask); - if (!prealloc) + if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1234,6 +1255,7 @@ search_again: spin_unlock(&tree->lock); if (mask & __GFP_WAIT) cond_resched(); + first_iteration = false; goto again; } @@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, state = find_first_extent_bit_state(tree, start, bits); got_it: if (state) { - cache_state(state, cached_state); + cache_state_if_flags(state, cached_state, 0); *start_ret = state->start; *end_ret = state->end; ret = 0; @@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, if (page_ops == 0) return 0; + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + mapping_set_error(inode->i_mapping, -EIO); + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, min_t(unsigned long, @@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_page_dirty_for_io(pages[i]); if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + if (page_ops & PAGE_SET_ERROR) + SetPageError(pages[i]); if (page_ops & PAGE_END_WRITEBACK) end_page_writeback(pages[i]); if (page_ops & PAGE_UNLOCK) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6d4b938be986..ece9ce87edff 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -49,6 +49,7 @@ #define PAGE_SET_WRITEBACK (1 << 2) #define PAGE_END_WRITEBACK (1 << 3) #define PAGE_SET_PRIVATE2 (1 << 4) +#define PAGE_SET_ERROR (1 << 5) /* * page->private values. Every page that is controlled by the extent diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 225302b39afb..6a98bddd8f33 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) - list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a18ceabd99a8..e4090259569b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_nocow_write(root); + ret = btrfs_start_write_no_snapshoting(root); if (!ret) return -ENOSPC; @@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, btrfs_free_reserved_data_space(inode, reserve_bytes); else - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); break; } @@ -1632,7 +1632,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); @@ -1661,7 +1661,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { btrfs_delalloc_release_space(inode, release_bytes); @@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t pos) { struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); ssize_t written; ssize_t written_buffered; loff_t endbyte; @@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, err = written_buffered; goto out; } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. + */ endbyte = pos + written_buffered - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + err = btrfs_fdatawrite_range(inode, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); if (err) goto out; written += written_buffered; @@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int ret; atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); return ret; @@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void) return 0; } + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + + return ret; +} diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 33848196550e..030847bf7cec 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,10 +27,17 @@ #include "disk-io.h" #include "extent_io.h" #include "inode-map.h" +#include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +struct btrfs_trim_range { + u64 start; + u64 bytes; + struct list_head list; +}; + static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, int ret; struct btrfs_free_cluster *cluster = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); + struct btrfs_trim_range *trim_entry; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { @@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, cluster = NULL; } } + + /* + * Make sure we don't miss any range that was removed from our rbtree + * because trimming is running. Otherwise after a umount+mount (or crash + * after committing the transaction) we would leak free space and get + * an inconsistent free space cache report from fsck. + */ + list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { + ret = io_ctl_add_entry(io_ctl, trim_entry->start, + trim_entry->bytes, NULL); + if (ret) + goto fail; + *entries += 1; + } + return 0; fail: return -ENOSPC; @@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_set_generation(&io_ctl, trans->transid); + mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ ret = write_cache_extent_entries(&io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); - if (ret) + if (ret) { + mutex_unlock(&ctl->cache_writeout_mutex); goto out_nospc; + } /* * Some spaces that are freed in the current transaction are pinned, @@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * committed, we shouldn't lose them. */ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); - if (ret) + if (ret) { + mutex_unlock(&ctl->cache_writeout_mutex); goto out_nospc; + } - /* At last, we write out all the bitmaps. */ + /* + * At last, we write out all the bitmaps and keep cache_writeout_mutex + * locked while doing it because a concurrent trim can be manipulating + * or freeing the bitmap. + */ ret = write_bitmap_entries(&io_ctl, &bitmap_list); + mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; @@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) ctl->start = block_group->key.objectid; ctl->private = block_group; ctl->op = &free_space_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); /* * we only want to have 32k of ram per block group for keeping @@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) static int do_trimming(struct btrfs_block_group_cache *block_group, u64 *total_trimmed, u64 start, u64 bytes, - u64 reserved_start, u64 reserved_bytes) + u64 reserved_start, u64 reserved_bytes, + struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; u64 trimmed = 0; @@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, if (!ret) *total_trimmed += trimmed; + mutex_lock(&ctl->cache_writeout_mutex); btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + list_del(&trim_entry->list); + mutex_unlock(&ctl->cache_writeout_mutex); if (update) { spin_lock(&space_info->lock); @@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, u64 bytes; while (start < end) { + struct btrfs_trim_range trim_entry; + + mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, start, 0, 1); if (!entry) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } @@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, node = rb_next(&entry->offset_index); if (!node) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto out; } entry = rb_entry(node, struct btrfs_free_space, @@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, if (entry->offset >= end) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } @@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, bytes = min(extent_start + extent_bytes, end) - start; if (bytes < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto next; } @@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, kmem_cache_free(btrfs_free_space_cachep, entry); spin_unlock(&ctl->tree_lock); + trim_entry.start = extent_start; + trim_entry.bytes = extent_bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes); + extent_start, extent_bytes, &trim_entry); if (ret) break; next: @@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, while (offset < end) { bool next_bitmap = false; + struct btrfs_trim_range trim_entry; + mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); if (!entry) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } @@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, ret2 = search_bitmap(ctl, entry, &start, &bytes); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } @@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, bytes = min(bytes, end - start); if (bytes < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto next; } @@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, free_bitmap(ctl, entry); spin_unlock(&ctl->tree_lock); + trim_entry.start = start; + trim_entry.bytes = bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes); + start, bytes, &trim_entry); if (ret) break; next: @@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, *trimmed = 0; + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + atomic_inc(&block_group->trimming); + spin_unlock(&block_group->lock); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); if (ret) - return ret; + goto out; ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + spin_lock(&block_group->lock); + if (atomic_dec_and_test(&block_group->trimming) && + block_group->removed) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + spin_unlock(&block_group->lock); + + em_tree = &block_group->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->key.objectid, + 1); + BUG_ON(!em); /* logic error, can't happen */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + lock_chunks(block_group->fs_info->chunk_root); + list_del_init(&em->list); + unlock_chunks(block_group->fs_info->chunk_root); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + + /* + * We've left one free space entry and other tasks trimming + * this block group have left 1 entry each one. Free them. + */ + __btrfs_remove_free_space_cache(block_group->free_space_ctl); + } else { + spin_unlock(&block_group->lock); + } return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 0cf4977ef70d..88b2238a0aed 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -38,6 +38,8 @@ struct btrfs_free_space_ctl { u64 start; struct btrfs_free_space_op *op; void *private; + struct mutex cache_writeout_mutex; + struct list_head trimming_ranges; }; struct btrfs_free_space_op { diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 83d646bd2e4b..74faea3a516e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) root->root_key.objectid); if (IS_ERR(tsk)) { btrfs_warn(root->fs_info, "failed to start inode caching task"); - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE, "disabling inode map caching"); } } @@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root) ctl->start = 0; ctl->private = NULL; ctl->op = &free_ino_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); /* * Initially we allow to use 16K of ram to cache chunks of diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d23362f4464e..8de23355f6cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode) * are written in the same order that the flusher thread sent them * down. */ -static noinline int compress_file_range(struct inode *inode, +static noinline void compress_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, struct async_cow *async_cow, @@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode, (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); - /* - * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. - */ - if ((end - start + 1) <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - goto cleanup_and_bail_uncompressed; - actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -440,6 +432,14 @@ again: total_compressed = actual_end - start; + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if (total_compressed <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + /* we want to make sure that amount of ram required to uncompress * an extent is reasonable, so we limit the total size in ram * of a compressed extent to 128k. This is a crucial number @@ -527,7 +527,10 @@ cont: if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DEFRAG; + unsigned long page_error_op; + clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; + page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, @@ -538,6 +541,7 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + page_error_op | PAGE_END_WRITEBACK); goto free_pages_out; } @@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed: *num_added += 1; } -out: - return ret; + return; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { @@ -629,8 +632,22 @@ free_pages_out: page_cache_release(pages[i]); } kfree(pages); +} - goto out; +static void free_async_extent_pages(struct async_extent *async_extent) +{ + int i; + + if (!async_extent->pages) + return; + + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; } /* @@ -639,7 +656,7 @@ free_pages_out: * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline int submit_compressed_extents(struct inode *inode, +static noinline void submit_compressed_extents(struct inode *inode, struct async_cow *async_cow) { struct async_extent *async_extent; @@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode, struct extent_io_tree *io_tree; int ret = 0; - if (list_empty(&async_cow->extents)) - return 0; - again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, @@ -709,15 +723,7 @@ retry: async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); if (ret) { - int i; - - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); - } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + free_async_extent_pages(async_extent); if (ret == -ENOSPC) { unlock_extent(io_tree, async_extent->start, @@ -814,15 +820,26 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); + if (ret) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct page *p = async_extent->pages[0]; + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + p->mapping = inode->i_mapping; + tree->ops->writepage_end_io_hook(p, start, end, + NULL, 0); + p->mapping = NULL; + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } alloc_hint = ins.objectid + ins.offset; kfree(async_extent); - if (ret) - goto out; cond_resched(); } - ret = 0; -out: - return ret; + return; out_free_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: @@ -832,7 +849,9 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); kfree(async_extent); goto again; } @@ -1318,7 +1337,7 @@ next_slot: * we fall into common COW way. */ if (!nolock) { - err = btrfs_start_nocow_write(root); + err = btrfs_start_write_no_snapshoting(root); if (!err) goto out_check; } @@ -1342,7 +1361,7 @@ out_check: if (extent_end <= start) { path->slots[0]++; if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto next_slot; } if (!nocow) { @@ -1362,7 +1381,7 @@ out_check: page_started, nr_written, 1); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } cow_start = (u64)-1; @@ -1413,7 +1432,7 @@ out_check: num_bytes); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } } @@ -1424,7 +1443,7 @@ out_check: EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -4580,6 +4599,26 @@ next: return err; } +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +static void wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} + static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize > oldsize) { truncate_pagecache(inode, newsize); + /* + * Don't do an expanding truncate while snapshoting is ongoing. + * This is to ensure the snapshot captures a fully consistent + * state of this file - if the snapshot captures this expanding + * truncation, it must capture all writes that happened before + * this truncation. + */ + wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); - if (ret) + if (ret) { + btrfs_end_write_no_snapshoting(root); return ret; + } trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_end_write_no_snapshoting(root); return PTR_ERR(trans); + } i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); + btrfs_end_write_no_snapshoting(root); btrfs_end_transaction(trans, root); } else { @@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_put_ordered_extent(ordered); } else { /* Screw you mmap */ - ret = filemap_write_and_wait_range(inode->i_mapping, - lockstart, - lockend); + ret = btrfs_fdatawrite_range(inode, lockstart, lockend); + if (ret) + break; + ret = filemap_fdatawait_range(inode->i_mapping, + lockstart, + lockend); if (ret) break; @@ -9442,6 +9497,21 @@ out_inode: } +/* Inspired by filemap_check_errors() */ +int btrfs_inode_check_errors(struct inode *inode) +{ + int ret = 0; + + if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) + ret = -ENOSPC; + if (test_bit(AS_EIO, &inode->i_mapping->flags) && + test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) + ret = -EIO; + + return ret; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4399f0c3a4ce..b590e23fa03e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -617,7 +617,7 @@ fail: return ret; } -static void btrfs_wait_nocow_write(struct btrfs_root *root) +static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) { s64 writers; DEFINE_WAIT(wait); @@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); - btrfs_wait_nocow_write(root); + btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - /* - * If orphan cleanup did remove any orphans, it means the tree was - * modified and therefore the commit root is not the same as the - * current root anymore. This is a problem, because send uses the - * commit root and therefore can see inode items that don't exist - * in the current root anymore, and for example make calls to - * btrfs_iget, which will do tree lookups based on the current root - * and not on the commit root. Those lookups will fail, returning a - * -ESTALE error, and making send fail with that error. So make sure - * a send does not see any orphans we have just removed, and that it - * will see the same inodes regardless of whether a transaction - * commit happened before it started (meaning that the commit root - * will be the same as the current root) or not. - */ - if (readonly && pending_snapshot->snap->node != - pending_snapshot->snap->commit_root) { - trans = btrfs_join_transaction(pending_snapshot->snap); - if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot->snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -761,7 +732,8 @@ fail: free: kfree(pending_snapshot); out: - atomic_dec(&root->will_be_snapshoted); + if (atomic_dec_and_test(&root->will_be_snapshoted)) + wake_up_atomic_t(&root->will_be_snapshoted); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ac734ec4cc20..534544e08f76 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); INIT_LIST_HEAD(&entry->log_list); + INIT_LIST_HEAD(&entry->trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -431,19 +432,31 @@ out: /* Needs to either be called under a log transaction or the log_mutex */ void btrfs_get_logged_extents(struct inode *inode, - struct list_head *logged_list) + struct list_head *logged_list, + const loff_t start, + const loff_t end) { struct btrfs_ordered_inode_tree *tree; struct btrfs_ordered_extent *ordered; struct rb_node *n; + struct rb_node *prev; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + n = __tree_search(&tree->tree, end, &prev); + if (!n) + n = prev; + for (; n; n = rb_prev(n)) { ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + if (ordered->file_offset > end) + continue; + if (entry_end(ordered) <= start) + break; if (!list_empty(&ordered->log_list)) continue; - list_add_tail(&ordered->log_list, logged_list); + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + list_add(&ordered->log_list, logged_list); atomic_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); @@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list, spin_unlock_irq(&log->log_extents_lock[index]); } -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; int index = transid % 2; @@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - btrfs_put_ordered_extent(ordered); + if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + list_add_tail(&ordered->trans_list, &trans->ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + ret = btrfs_fdatawrite_range(inode, start, orig_end); if (ret) return ret; - /* - * So with compression we will find and lock a dirty page and clear the - * first one as dirty, setup an async extent, and immediately return - * with the entire range locked but with nobody actually marked with - * writeback. So we can't just filemap_write_and_wait_range() and - * expect it to work since it will just kick off a thread to do the - * actual work. So we need to call filemap_fdatawrite_range _again_ - * since it will wait on the page lock, which won't be unlocked until - * after the pages have been marked as writeback and so we're good to go - * from there. We have to do this otherwise we'll miss the ordered - * extents and that results in badness. Please Josef, do not think you - * know better and pull this out at some point in the future, it is - * right and you are wrong. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - orig_end); - if (ret) - return ret; - } + ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); if (ret) return ret; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d81a274d621e..e96cd4ccd805 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -71,6 +71,8 @@ struct btrfs_ordered_sum { ordered extent */ #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ +#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent + * in the logging code. */ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -121,6 +123,9 @@ struct btrfs_ordered_extent { /* If we need to wait on this to be done */ struct list_head log_list; + /* If the transaction needs to wait on this ordered extent */ + struct list_head trans_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct inode *inode, - struct list_head *logged_list); + struct list_head *logged_list, + const loff_t start, + const loff_t end); void btrfs_put_logged_extents(struct list_head *logged_list); void btrfs_submit_logged_extents(struct list_head *logged_list, struct btrfs_root *log); -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27f2e16cd259..f2bb13a23f86 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4049,6 +4049,50 @@ out: scrub_pending_trans_workers_dec(sctx); } +static int check_extent_to_block(struct inode *inode, u64 start, u64 len, + u64 logical) +{ + struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree; + struct extent_map *em; + u64 lockstart = start, lockend = start + len - 1; + int ret = 0; + + io_tree = &BTRFS_I(inode)->io_tree; + + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + ret = 1; + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > logical || + em->block_start + em->block_len < logical + len) { + free_extent_map(em); + ret = 1; + goto out_unlock; + } + free_extent_map(em); + +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); + return ret; +} + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct scrub_copy_nocow_ctx *nocow_ctx) { @@ -4057,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct inode *inode; struct page *page; struct btrfs_root *local_root; - struct btrfs_ordered_extent *ordered; - struct extent_map *em; - struct extent_state *cached_state = NULL; struct extent_io_tree *io_tree; u64 physical_for_dev_replace; + u64 nocow_ctx_logical; u64 len = nocow_ctx->len; - u64 lockstart = offset, lockend = offset + len - 1; unsigned long index; int srcu_index; int ret = 0; @@ -4095,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; io_tree = &BTRFS_I(inode)->io_tree; + nocow_ctx_logical = nocow_ctx->logical; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, lockstart, len); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock; - } - - /* - * This extent does not actually cover the logical extent anymore, - * move on to the next inode. - */ - if (em->block_start > nocow_ctx->logical || - em->block_start + em->block_len < nocow_ctx->logical + len) { - free_extent_map(em); - goto out_unlock; + ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto out; } - free_extent_map(em); while (len >= PAGE_CACHE_SIZE) { index = offset >> PAGE_CACHE_SHIFT; @@ -4135,7 +4159,7 @@ again: goto next_page; } else { ClearPageError(page); - err = extent_read_full_page_nolock(io_tree, page, + err = extent_read_full_page(io_tree, page, btrfs_get_extent, nocow_ctx->mirror_num); if (err) { @@ -4160,6 +4184,14 @@ again: goto next_page; } } + + ret = check_extent_to_block(inode, offset, len, + nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto next_page; + } + err = write_page_nocow(nocow_ctx->sctx, physical_for_dev_replace, page); if (err) @@ -4173,12 +4205,10 @@ next_page: offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; + nocow_ctx_logical += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } ret = COPY_COMPLETE; -out_unlock: - unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, - GFP_NOFS); out: mutex_unlock(&inode->i_mutex); iput(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 874828dd0a86..804432dbc351 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5507,6 +5507,51 @@ out: return ret; } +/* + * If orphan cleanup did remove any orphans from a root, it means the tree + * was modified and therefore the commit root is not the same as the current + * root anymore. This is a problem, because send uses the commit root and + * therefore can see inode items that don't exist in the current root anymore, + * and for example make calls to btrfs_iget, which will do tree lookups based + * on the current root and not on the commit root. Those lookups will fail, + * returning a -ESTALE error, and making send fail with that error. So make + * sure a send does not see any orphans we have just removed, and that it will + * see the same inodes regardless of whether a transaction commit happened + * before it started (meaning that the commit root will be the same as the + * current root) or not. + */ +static int ensure_commit_roots_uptodate(struct send_ctx *sctx) +{ + int i; + struct btrfs_trans_handle *trans = NULL; + +again: + if (sctx->parent_root && + sctx->parent_root->node != sctx->parent_root->commit_root) + goto commit_trans; + + for (i = 0; i < sctx->clone_roots_cnt; i++) + if (sctx->clone_roots[i].root->node != + sctx->clone_roots[i].root->commit_root) + goto commit_trans; + + if (trans) + return btrfs_end_transaction(trans, sctx->send_root); + + return 0; + +commit_trans: + /* Use any root, all fs roots will get their commit roots updated. */ + if (!trans) { + trans = btrfs_join_transaction(sctx->send_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto again; + } + + return btrfs_commit_transaction(trans, sctx->send_root); +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); @@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; + ret = ensure_commit_roots_uptodate(sctx); + if (ret) + goto out; + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 54bd91ece35b..60f7cbe815e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ - if (!trans->blocks_used) { + if (!trans->blocks_used && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); @@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "disabling disk space caching"); break; case Opt_inode_cache: - btrfs_set_and_info(root, CHANGE_INODE_CACHE, + btrfs_set_pending_and_info(info, INODE_MAP_CACHE, "enabling inode map caching"); break; case Opt_noinode_cache: - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, "disabling inode map caching"); break; case Opt_clear_cache: @@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ - if (PTR_ERR(trans) == -ENOENT) - return 0; - return PTR_ERR(trans); + if (PTR_ERR(trans) == -ENOENT) { + /* + * Exit unless we have some pending changes + * that need to go through commit + */ + if (fs_info->pending_changes == 0) + return 0; + trans = btrfs_start_transaction(root, 0); + } else { + return PTR_ERR(trans); + } } return btrfs_commit_transaction(trans, root); } @@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int i = 0, nr_devices; int ret; + /* + * We aren't under the device list lock, so this is racey-ish, but good + * enough for our purposes. + */ nr_devices = fs_info->fs_devices->open_devices; - BUG_ON(!nr_devices); + if (!nr_devices) { + smp_mb(); + nr_devices = fs_info->fs_devices->open_devices; + ASSERT(nr_devices); + if (!nr_devices) { + *free_bytes = 0; + return 0; + } + } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), GFP_NOFS); @@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (fs_info->alloc_start) + mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!device->in_fs_metadata || !device->bdev || device->is_tgtdev_for_dev_replace) continue; + if (i >= nr_devices) + break; + avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ @@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) skip_space = 1024 * 1024; /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) + if (fs_info->alloc_start && + fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) { + rcu_read_unlock(); skip_space = max(fs_info->alloc_start, skip_space); - /* - * btrfs can not use the free space in [0, skip_space - 1], - * we must subtract it from the total. In order to implement - * it, we account the used space in this range first. - */ - ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - return ret; - } + /* + * btrfs can not use the free space in + * [0, skip_space - 1], we must subtract it from the + * total. In order to implement it, we account the used + * space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, + skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + mutex_unlock(&fs_devices->device_list_mutex); + return ret; + } - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; + rcu_read_lock(); + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + } /* * we can use the free space in [0, skip_space - 1], subtract @@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) i++; } + rcu_read_unlock(); + if (fs_info->alloc_start) + mutex_unlock(&fs_devices->device_list_mutex); nr_devices = i; @@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * holding chunk_muext to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree -= block_rsv->size >> bits; spin_unlock(&block_rsv->lock); - buf->f_bavail = total_free_data; + buf->f_bavail = div_u64(total_free_data, factor); ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); - if (ret) { - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (ret) return ret; - } buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b2e7bb4393f6..92db3f648df4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, { struct btrfs_fs_info *fs_info; struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); - struct btrfs_trans_handle *trans; u64 features, set, clear; unsigned long val; int ret; @@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, btrfs_info(fs_info, "%s %s feature flag", val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); - trans = btrfs_start_transaction(fs_info->fs_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - spin_lock(&fs_info->super_lock); features = get_features(fs_info, fa->feature_set); if (val) @@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, set_features(fs_info, fa->feature_set, features); spin_unlock(&fs_info->super_lock); - ret = btrfs_commit_transaction(trans, fs_info->fs_root); - if (ret) - return ret; + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); return count; } @@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - struct btrfs_trans_handle *trans; - struct btrfs_root *root = fs_info->fs_root; - int ret; size_t p_len; if (fs_info->sb->s_flags & MS_RDONLY) @@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj, if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - spin_lock(&root->fs_info->super_lock); + spin_lock(&fs_info->super_lock); memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); memcpy(fs_info->super_copy->label, buf, p_len); - spin_unlock(&root->fs_info->super_lock); - ret = btrfs_commit_transaction(trans, root); + spin_unlock(&fs_info->super_lock); - if (!ret) - return len; + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); - return ret; + return len; } BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..a605d4e2f2bc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } +static void clear_btree_io_tree(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } + spin_unlock(&tree->lock); +} + static noinline void switch_commit_roots(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { @@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, root->commit_root = btrfs_root_node(root); if (is_fstree(root->objectid)) btrfs_unpin_free_ino(root); + clear_btree_io_tree(&root->dirty_log_pages); } up_write(&fs_info->commit_root_sem); } @@ -220,6 +247,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); + INIT_LIST_HEAD(&cur_trans->pending_ordered); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -488,6 +516,7 @@ again: h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); + INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + if (!list_empty(&trans->ordered)) { + spin_lock(&info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); + spin_unlock(&info->trans_lock); + } + trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); - cached_state = NULL; - err = filemap_fdatawrite_range(mapping, start, end); + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + /* + * convert_extent_bit can return -ENOMEM, which is most of the + * time a temporary error. So when it happens, ignore the error + * and wait for writeback of this range to finish - because we + * failed to set the bit EXTENT_NEED_WAIT for the range, a call + * to btrfs_wait_marked_extents() would not know that writeback + * for this range started and therefore wouldn't wait for it to + * finish - we don't want to commit a superblock that points to + * btree nodes/leafs for which writeback hasn't finished yet + * (and without errors). + * We cleanup any entries left in the io tree when committing + * the transaction (through clear_btree_io_tree()). + */ + if (err == -ENOMEM) { + err = 0; + wait_writeback = true; + } + if (!err) + err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; + else if (wait_writeback) + werr = filemap_fdatawait_range(mapping, start, end); + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; return werr; } @@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { - clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); - err = filemap_fdatawait_range(mapping, start, end); + /* + * Ignore -ENOMEM errors returned by clear_extent_bit(). + * When committing the transaction, we'll remove any entries + * left in the io tree. For a log commit, we don't remove them + * after committing the log because the tree can be accessed + * concurrently - we do it only at transaction commit time when + * it's safe to do it (through clear_btree_io_tree()). + */ + err = clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); + if (err == -ENOMEM) + err = 0; + if (!err) + err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } @@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return 0; } -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!trans || !trans->transaction) { - struct inode *btree_inode; - btree_inode = root->fs_info->btree_inode; - return filemap_write_and_wait(btree_inode->i_mapping); - } - return btrfs_write_and_wait_marked_extents(root, + int ret; + + ret = btrfs_write_and_wait_marked_extents(root, &trans->transaction->dirty_pages, EXTENT_DIRTY); + clear_btree_io_tree(&trans->transaction->dirty_pages); + + return ret; } /* @@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, -1); } +static inline void +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); + btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we @@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } /* - * Since the transaction is done, we should set the inode map cache flag - * before any other comming transaction. + * Since the transaction is done, we can apply the pending changes + * before the next transaction. */ - if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) - btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); - else - btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + btrfs_apply_pending_changes(root->fs_info); /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots @@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) return (ret < 0) ? 0 : 1; } + +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +{ + unsigned long prev; + unsigned long bit; + + prev = cmpxchg(&fs_info->pending_changes, 0, 0); + if (!prev) + return; + + bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; + if (prev & bit) + btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; + if (prev & bit) + btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_COMMIT; + if (prev & bit) + btrfs_debug(fs_info, "pending commit done"); + prev &= ~bit; + + if (prev) + btrfs_warn(fs_info, + "unknown pending changes left 0x%lx, ignoring", prev); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8f40e1a5d2d..00ed29c4b3f9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -56,6 +56,7 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head pending_chunks; + struct list_head pending_ordered; struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; int aborted; @@ -105,6 +106,7 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; + struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; @@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); @@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); + #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 286213cec861..9a02da16f2be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, + mark); + btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(trans, log_root_tree, root_log_ctx.log_transid); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = root_log_ctx.log_ret; + if (!ret) + ret = root_log_ctx.log_ret; goto out; } ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); @@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_NEW | EXTENT_DIRTY); - btrfs_wait_logged_extents(log, log_transid); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + if (!ret) + ret = btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } + btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + /* + * Clear the AS_EIO/AS_ENOSPC flags from the inode's + * i_mapping flags, so that the next fsync won't get + * an outdated io error too. + */ + btrfs_inode_check_errors(inode); *ordered_io_error = true; break; } @@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) btrfs_set_token_file_extent_type(leaf, fi, @@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(inode, &logged_list); + btrfs_get_logged_extents(inode, &logged_list, start, end); /* * a brute force approach to making sure we get the most uptodate @@ -4089,6 +4104,21 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { + /* + * Some ordered extents started by fsync might have completed + * before we collected the ordered extents in logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. + */ + err = btrfs_inode_check_errors(inode); + if (err) { + ctx->io_err = err; + goto out_unlock; + } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cbb766577f31..0144790e296e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -static void lock_chunks(struct btrfs_root *root) -{ - mutex_lock(&root->fs_info->chunk_mutex); -} - -static void unlock_chunks(struct btrfs_root *root) -{ - mutex_unlock(&root->fs_info->chunk_mutex); -} - static struct btrfs_fs_devices *__alloc_fs_devices(void) { struct btrfs_fs_devices *fs_devs; @@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, u64 *start, u64 len) { struct extent_map *em; + struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; - list_for_each_entry(em, &trans->transaction->pending_chunks, list) { +again: + list_for_each_entry(em, search_list, list) { struct map_lookup *map; int i; @@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, ret = 1; } } + if (search_list == &trans->transaction->pending_chunks) { + search_list = &trans->root->fs_info->pinned_chunks; + goto again; + } return ret; } @@ -1800,8 +1796,8 @@ error_undo: goto error_brelse; } -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices; @@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->bdev) fs_devices->open_devices--; +} + +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; call_rcu(&srcdev->rcu, free_device); @@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } - ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; } - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for the tree */ - free_extent_map(em); out: /* once for us */ free_extent_map(em); @@ -4505,6 +4501,8 @@ error_del_extent: free_extent_map(em); /* One for the tree reference */ free_extent_map(em); + /* One for the pending_chunks list reference */ + free_extent_map(em); error: kfree(devices_info); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 70be2571cedf..d6fe73c0f4a2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -456,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, @@ -521,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, struct btrfs_transaction *transaction); + +static inline void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static inline void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index dcf20131fbe4..47b19465f0dc 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -29,6 +29,7 @@ #include "xattr.h" #include "disk-io.h" #include "props.h" +#include "locking.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct btrfs_dir_item *di; + struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; size_t name_len = strlen(name); @@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->skip_release_on_error = 1; + + if (!value) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (!di && (flags & XATTR_REPLACE)) + ret = -ENODATA; + else if (di) + ret = btrfs_delete_one_dir_name(trans, root, path, di); + goto out; + } + /* + * For a replace we can't just do the insert blindly. + * Do a lookup first (read-only btrfs_search_slot), and return if xattr + * doesn't exist. If it exists, fall down below to the insert/replace + * path - we can't race with a concurrent xattr delete, because the VFS + * locks the inode's i_mutex before calling setxattr or removexattr. + */ if (flags & XATTR_REPLACE) { - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { + ASSERT(mutex_is_locked(&inode->i_mutex)); + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (!di) { ret = -ENODATA; goto out; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; btrfs_release_path(path); + di = NULL; + } + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + if (ret == -EOVERFLOW) { /* - * remove the attribute + * We have an existing item in a leaf, split_leaf couldn't + * expand it. That item might have or not a dir_item that + * matches our target xattr, so lets check. */ - if (!value) - goto out; - } else { - di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), - name, name_len, 0); - if (IS_ERR(di)) { - ret = PTR_ERR(di); + ret = 0; + btrfs_assert_tree_locked(path->nodes[0]); + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (!di && !(flags & XATTR_REPLACE)) { + ret = -ENOSPC; goto out; } - if (!di && !value) - goto out; - btrfs_release_path(path); + } else if (ret == -EEXIST) { + ret = 0; + di = btrfs_match_dir_item_name(root, path, name, name_len); + ASSERT(di); /* logic error */ + } else if (ret) { + goto out; } -again: - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - /* - * If we're setting an xattr to a new value but the new value is say - * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting - * back from split_leaf. This is because it thinks we'll be extending - * the existing item size, but we're asking for enough space to add the - * item itself. So if we get EOVERFLOW just set ret to EEXIST and let - * the rest of the function figure it out. - */ - if (ret == -EOVERFLOW) + if (di && (flags & XATTR_CREATE)) { ret = -EEXIST; + goto out; + } - if (ret == -EEXIST) { - if (flags & XATTR_CREATE) - goto out; + if (di) { /* - * We can't use the path we already have since we won't have the - * proper locking for a delete, so release the path and - * re-lookup to delete the thing. + * We're doing a replace, and it must be atomic, that is, at + * any point in time we have either the old or the new xattr + * value in the tree. We don't want readers (getxattr and + * listxattrs) to miss a value, this is specially important + * for ACLs. */ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), - name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { - /* Shouldn't happen but just in case... */ - btrfs_release_path(path); - goto again; + const int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + const u16 old_data_len = btrfs_dir_data_len(leaf, di); + const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 data_size = sizeof(*di) + name_len + size; + struct btrfs_item *item; + unsigned long data_ptr; + char *ptr; + + if (size > old_data_len) { + if (btrfs_leaf_free_space(root, leaf) < + (size - old_data_len)) { + ret = -ENOSPC; + goto out; + } } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; + if (old_data_len + name_len + sizeof(*di) == item_size) { + /* No other xattrs packed in the same leaf item. */ + if (size > old_data_len) + btrfs_extend_item(root, path, + size - old_data_len); + else if (size < old_data_len) + btrfs_truncate_item(root, path, data_size, 1); + } else { + /* There are other xattrs packed in the same item. */ + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_extend_item(root, path, data_size); + } + item = btrfs_item_nr(slot); + ptr = btrfs_item_ptr(leaf, slot, char); + ptr += btrfs_item_size(leaf, item) - data_size; + di = (struct btrfs_dir_item *)ptr; + btrfs_set_dir_data_len(leaf, di, size); + data_ptr = ((unsigned long)(di + 1)) + name_len; + write_extent_buffer(leaf, value, data_ptr, size); + btrfs_mark_buffer_dirty(leaf); + } else { /* - * We have a value to set, so go back and try to insert it now. + * Insert, and we had space for the xattr, so path->slots[0] is + * where our xattr dir_item is and btrfs_insert_xattr_item() + * filled it. */ - if (value) { - btrfs_release_path(path); - goto again; - } } out: btrfs_free_path(path); |