diff options
author | David Sterba <dsterba@suse.com> | 2016-02-26 15:38:32 +0100 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2016-02-26 15:38:32 +0100 |
commit | 675d276b322b45e7bf7c616a2847bdc425745b99 (patch) | |
tree | c61e7108034bf1230024880ec8ae46a9617077c2 /fs/btrfs | |
parent | e9ddd77a31e6ba2867b9c33547f6f079d55f7a72 (diff) | |
parent | 73beece9ca07c003e0e4f4825b12be167334d4ad (diff) | |
download | blackbird-obmc-linux-675d276b322b45e7bf7c616a2847bdc425745b99.tar.gz blackbird-obmc-linux-675d276b322b45e7bf7c616a2847bdc425745b99.zip |
Merge branch 'foreign/liubo/replace-lockup' into for-chris-4.6
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/backref.c | 10 | ||||
-rw-r--r-- | fs/btrfs/compression.c | 6 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 6 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.c | 3 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.h | 2 | ||||
-rw-r--r-- | fs/btrfs/dev-replace.c | 130 | ||||
-rw-r--r-- | fs/btrfs/dev-replace.h | 7 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 6 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 45 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 3 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 14 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 119 | ||||
-rw-r--r-- | fs/btrfs/reada.c | 10 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 6 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 24 |
15 files changed, 236 insertions, 155 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b90cd3776f8e..f6dac40f87ff 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1406,7 +1406,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, read_extent_buffer(eb, dest + bytes_left, name_off, name_len); if (eb != eb_in) { - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, @@ -1426,9 +1427,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { - atomic_inc(&eb->refs); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (!path->skip_locking) + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[0] = NULL; + path->locks[0] = 0; } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c473c42d7d6c..3346cd8f9910 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - /* In the parent-locked case, we only locked the range we are - * interested in. In all other cases, we can opportunistically - * cache decompressed data that goes beyond the requested range. */ - if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e2a5cc0d4a14..38f34dadb86a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1005,8 +1005,10 @@ struct btrfs_dev_replace { pid_t lock_owner; atomic_t nesting_level; struct mutex lock_finishing_cancel_unmount; - struct mutex lock_management_lock; - struct mutex lock; + rwlock_t lock; + atomic_t read_locks; + atomic_t blocking_readers; + wait_queue_head_t read_lock_wq; struct btrfs_scrub_progress scrub_progress; }; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0be47e4b8136..b57daa895cea 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list) + struct list_head *ins_list, bool *emitted) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, if (over) return 1; + *emitted = true; } return 0; } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f70119f25421..0167853c84ae 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list); + struct list_head *ins_list, bool *emitted); /* for init */ int __init btrfs_delayed_inode_init(void); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 01ce5fcecc5c..ff2db7a6c894 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); return 0; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_mark_buffer_dirty(eb); @@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, return PTR_ERR(trans); } - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->is_valid = 1; dev_replace->item_needs_writeback = 1; args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); goto leave; } @@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } @@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); /* * flush all outstanding I/O and inode extent mappings before the @@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&root->fs_info->fs_devices->device_list_mutex); mutex_lock(&root->fs_info->chunk_mutex); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, rcu_str_deref(src_device->name), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); mutex_unlock(&root->fs_info->chunk_mutex); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); @@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_rm_dev_replace_blocked(fs_info); @@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *srcdev; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); } int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, @@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); goto leave; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: @@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); @@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 1); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); return 0; } - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 1); WARN_ON(atomic_xchg( &fs_info->mutually_exclusive_operation_running, 1)); @@ -865,48 +865,58 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw) { - /* the beginning is just an optimization for the typical case */ - if (atomic_read(&dev_replace->nesting_level) == 0) { -acquire_lock: - /* this is not a nested case where the same thread - * is trying to acqurire the same lock twice */ - mutex_lock(&dev_replace->lock); - mutex_lock(&dev_replace->lock_management_lock); - dev_replace->lock_owner = current->pid; - atomic_inc(&dev_replace->nesting_level); - mutex_unlock(&dev_replace->lock_management_lock); - return; + if (rw == 1) { + /* write */ +again: + wait_event(dev_replace->read_lock_wq, + atomic_read(&dev_replace->blocking_readers) == 0); + write_lock(&dev_replace->lock); + if (atomic_read(&dev_replace->blocking_readers)) { + write_unlock(&dev_replace->lock); + goto again; + } + } else { + read_lock(&dev_replace->lock); + atomic_inc(&dev_replace->read_locks); } +} - mutex_lock(&dev_replace->lock_management_lock); - if (atomic_read(&dev_replace->nesting_level) > 0 && - dev_replace->lock_owner == current->pid) { - WARN_ON(!mutex_is_locked(&dev_replace->lock)); - atomic_inc(&dev_replace->nesting_level); - mutex_unlock(&dev_replace->lock_management_lock); - return; +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw) +{ + if (rw == 1) { + /* write */ + ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); + write_unlock(&dev_replace->lock); + } else { + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + atomic_dec(&dev_replace->read_locks); + read_unlock(&dev_replace->lock); } +} - mutex_unlock(&dev_replace->lock_management_lock); - goto acquire_lock; +/* inc blocking cnt and release read lock */ +void btrfs_dev_replace_set_lock_blocking( + struct btrfs_dev_replace *dev_replace) +{ + /* only set blocking for read lock */ + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + atomic_inc(&dev_replace->blocking_readers); + read_unlock(&dev_replace->lock); } -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +/* acquire read lock and dec blocking cnt */ +void btrfs_dev_replace_clear_lock_blocking( + struct btrfs_dev_replace *dev_replace) { - WARN_ON(!mutex_is_locked(&dev_replace->lock)); - mutex_lock(&dev_replace->lock_management_lock); - WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); - WARN_ON(dev_replace->lock_owner != current->pid); - atomic_dec(&dev_replace->nesting_level); - if (atomic_read(&dev_replace->nesting_level) == 0) { - dev_replace->lock_owner = 0; - mutex_unlock(&dev_replace->lock_management_lock); - mutex_unlock(&dev_replace->lock); - } else { - mutex_unlock(&dev_replace->lock_management_lock); - } + /* only set blocking for read lock */ + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); + read_lock(&dev_replace->lock); + if (atomic_dec_and_test(&dev_replace->blocking_readers) && + waitqueue_active(&dev_replace->read_lock_wq)) + wake_up(&dev_replace->read_lock_wq); } void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 20035cbbf021..29e3ef5f96bd 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw); +void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_clear_lock_blocking( + struct btrfs_dev_replace *dev_replace); static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index de68b8b61fd2..1809475ace2f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2273,9 +2273,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) fs_info->dev_replace.lock_owner = 0; atomic_set(&fs_info->dev_replace.nesting_level, 0); mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - mutex_init(&fs_info->dev_replace.lock_management_lock); - mutex_init(&fs_info->dev_replace.lock); + rwlock_init(&fs_info->dev_replace.lock); + atomic_set(&fs_info->dev_replace.read_locks, 0); + atomic_set(&fs_info->dev_replace.blocking_readers, 0); init_waitqueue_head(&fs_info->replace_wait); + init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b2073389dc2..ed2f32dd0d29 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2897,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree, struct block_device *bdev; int ret; int nr = 0; - int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + unsigned long this_bio_flag = 0; set_page_extent_mapped(page); @@ -2942,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree, kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (!parent_locked) - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -3038,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (parent_locked) - free_extent_state(cached); - else - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -3052,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3063,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3083,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, *bio_flags = this_bio_flag; } else { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); } cur = cur + iosize; pg_offset += iosize; @@ -3214,20 +3205,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; - int ret; - - ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, - &bio_flags, READ, NULL); - if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); - return ret; -} - static noinline void update_nr_written(struct page *page, struct writeback_control *wbc, unsigned long nr_written) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0377413bd4b9..880d5292e972 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,7 +29,6 @@ */ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_TREE_LOG 2 -#define EXTENT_BIO_PARENT_LOCKED 4 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 350b77220153..d4381f5ef52d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5731,6 +5731,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) char *name_ptr; int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ + bool emitted; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5759,6 +5760,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (ret < 0) goto err; + emitted = false; while (1) { leaf = path->nodes[0]; slot = path->slots[0]; @@ -5838,6 +5840,7 @@ skip: if (over) goto nopos; + emitted = true; di_len = btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di) + sizeof(*di); di_cur += di_len; @@ -5850,11 +5853,20 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) ctx->pos++; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); if (ret) goto nopos; } + /* + * If we haven't emitted any dir entry, we must not touch ctx->pos as + * it was was set to the termination value in previous call. We assume + * that "." and ".." were emitted if we reach this point and set the + * termination value as well for an empty directory. + */ + if (ctx->pos > 2 && !emitted) + goto nopos; + /* Reached end of directory/root. Bump pos past the last item. */ ctx->pos++; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9ba7b5be3fe5..a0be375702f9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2794,24 +2794,29 @@ out: static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; page = grab_cache_page(inode->i_mapping, index); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); if (!PageUptodate(page)) { - if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, - 0)) - return NULL; + int ret; + + ret = btrfs_readpage(NULL, page); + if (ret) + return ERR_PTR(ret); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); - return NULL; + return ERR_PTR(-EIO); + } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + return ERR_PTR(-EAGAIN); } } - unlock_page(page); return page; } @@ -2823,17 +2828,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages, pgoff_t index = off >> PAGE_CACHE_SHIFT; for (i = 0; i < num_pages; i++) { +again: pages[i] = extent_same_get_page(inode, index + i); - if (!pages[i]) - return -ENOMEM; + if (IS_ERR(pages[i])) { + int err = PTR_ERR(pages[i]); + + if (err == -EAGAIN) + goto again; + pages[i] = NULL; + return err; + } } return 0; } -static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +static int lock_extent_range(struct inode *inode, u64 off, u64 len, + bool retry_range_locking) { - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ + /* + * Do any pending delalloc/csum calculations on inode, one way or + * another, and lock file content. + * The locking order is: + * + * 1) pages + * 2) range in the inode's io tree + */ while (1) { struct btrfs_ordered_extent *ordered; lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); @@ -2851,8 +2870,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); + if (!retry_range_locking) + return -EAGAIN; btrfs_wait_ordered_range(inode, off, len); } + return 0; } static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) @@ -2877,15 +2899,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len, + bool retry_range_locking) { + int ret; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - lock_extent_range(inode1, loff1, len); - lock_extent_range(inode2, loff2, len); + ret = lock_extent_range(inode1, loff1, len, retry_range_locking); + if (ret) + return ret; + ret = lock_extent_range(inode2, loff2, len, retry_range_locking); + if (ret) + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, + loff1 + len - 1); + return ret; } struct cmp_pages { @@ -2901,11 +2932,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) for (i = 0; i < cmp->num_pages; i++) { pg = cmp->src_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } pg = cmp->dst_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } } kfree(cmp->src_pages); kfree(cmp->dst_pages); @@ -2966,6 +3001,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, src_page = cmp->src_pages[i]; dst_page = cmp->dst_pages[i]; + ASSERT(PageLocked(src_page)); + ASSERT(PageLocked(dst_page)); addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -3078,14 +3115,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, goto out_unlock; } +again: ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); if (ret) goto out_unlock; if (same_inode) - lock_extent_range(src, same_lock_start, same_lock_len); + ret = lock_extent_range(src, same_lock_start, same_lock_len, + false); else - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, + false); + /* + * If one of the inodes has dirty pages in the respective range or + * ordered extents, we need to flush dellaloc and wait for all ordered + * extents in the range. We must unlock the pages and the ranges in the + * io trees to avoid deadlocks when flushing delalloc (requires locking + * pages) and when waiting for ordered extents to complete (they require + * range locking). + */ + if (ret == -EAGAIN) { + /* + * Ranges in the io trees already unlocked. Now unlock all + * pages before waiting for all IO to complete. + */ + btrfs_cmp_data_free(&cmp); + if (same_inode) { + btrfs_wait_ordered_range(src, same_lock_start, + same_lock_len); + } else { + btrfs_wait_ordered_range(src, loff, len); + btrfs_wait_ordered_range(dst, dst_loff, len); + } + goto again; + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + btrfs_cmp_data_free(&cmp); + return ret; + } /* pass original length for comparison so we stay within i_size */ ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); @@ -3795,9 +3864,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 lock_start = min_t(u64, off, destoff); u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, lock_start, lock_len); + ret = lock_extent_range(src, lock_start, lock_len, true); } else { - btrfs_double_extent_lock(src, off, inode, destoff, len); + ret = btrfs_double_extent_lock(src, off, inode, destoff, len, + true); + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + goto out_unlock; } ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index bf69c008249c..b892914968c1 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -396,7 +396,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, } /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { @@ -404,12 +404,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } prev_dev = NULL; @@ -451,13 +451,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); if (!have_zone) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2de7817d0e1b..e42aa27c96e9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3859,16 +3859,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); if (dev->scrub_device || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -EINPROGRESS; } - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6e0e4396d3a4..80857b4646c0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1715,12 +1715,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } while (read_seqretry(&root->fs_info->profiles_lock, seq)); num_devices = root->fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&root->fs_info->dev_replace); + btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { WARN_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; @@ -3687,12 +3687,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { BUG_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; @@ -5063,10 +5063,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) ret++; - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); return ret; } @@ -5326,10 +5326,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!bbio_ret) goto out; - btrfs_dev_replace_lock(dev_replace); + btrfs_dev_replace_lock(dev_replace, 0); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (!dev_replace_is_ongoing) - btrfs_dev_replace_unlock(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); + else + btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && @@ -5752,8 +5754,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->mirror_num = map->num_stripes + 1; } out: - if (dev_replace_is_ongoing) - btrfs_dev_replace_unlock(dev_replace); + if (dev_replace_is_ongoing) { + btrfs_dev_replace_clear_lock_blocking(dev_replace); + btrfs_dev_replace_unlock(dev_replace, 0); + } free_extent_map(em); return ret; } |