diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/ctree.c | 16 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 11 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 12 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 21 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 3 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 5 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 49 | ||||
-rw-r--r-- | fs/btrfs/send.c | 2 | ||||
-rw-r--r-- | fs/btrfs/super.c | 82 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 12 |
10 files changed, 112 insertions, 101 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d92462fe66c8..f64aad613727 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1016,19 +1016,21 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; /* - * If we are COWing a node/leaf from the extent, chunk or device trees, - * make sure that we do not finish block group creation of pending block - * groups. We do this to avoid a deadlock. + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. * COWing can result in allocation of a new chunk, and flushing pending * block groups (btrfs_create_pending_block_groups()) can be triggered * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk and device trees, therefore we could - * deadlock with ourselves since we are holding a lock on an extent - * buffer that btrfs_create_pending_block_groups() may try to COW later. + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. */ if (root == fs_info->extent_root || root == fs_info->chunk_root || - root == fs_info->dev_root) + root == fs_info->dev_root || + root == fs_info->free_space_root) trans->can_flush_pending_bgs = false; cow = btrfs_alloc_tree_block(trans, root, parent_start, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f031a447a047..7a2a2621f0d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; +struct btrfs_delayed_ref_root; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -786,6 +787,9 @@ enum { * main phase. The fs_info::balance_ctl is initialized. */ BTRFS_FS_BALANCE_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, }; struct btrfs_fs_info { @@ -1144,9 +1148,6 @@ struct btrfs_fs_info { struct mutex unused_bg_unpin_mutex; struct mutex delete_unused_bgs_mutex; - /* For btrfs to record security options */ - struct security_mnt_opts security_opts; - /* * Chunks that can't be freed yet (under a trim/discard operation) * and will be latter freed. Protected by fs_info->chunk_mutex. @@ -2664,6 +2665,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, @@ -3021,7 +3025,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - security_free_mnt_opts(&fs_info->security_opts); kvfree(fs_info); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8da2f380d3c0..6a2a2a951705 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1682,6 +1682,8 @@ static int cleaner_kthread(void *arg) while (1) { again = 0; + set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; @@ -1728,6 +1730,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) @@ -4201,6 +4204,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -4265,6 +4276,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (pin_bytes) btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b15afeae16df..d81035b7ea7d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2456,12 +2456,10 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; int nr_items = 1; /* Dropping this ref head update. */ if (head->total_ref_mod < 0) { @@ -2544,7 +2542,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -4954,6 +4952,15 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + mutex_lock(&fs_info->cleaner_delayed_iput_mutex); + btrfs_run_delayed_iputs(fs_info); + mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + ret = may_commit_transaction(fs_info, space_info); break; default: @@ -7188,7 +7195,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fc126b92ea59..52abe4082680 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4103,8 +4103,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, while (!list_empty(pages)) { for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { - struct page *page = list_entry(pages->prev, - struct page, lru); + struct page *page = lru_to_page(pages); prefetchw(&page->flags); list_del(&page->lru); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43eb4535319d..5c349667c761 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3129,9 +3129,6 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - /* Try to release some metadata so we don't get an OOM but don't wait */ - btrfs_btree_balance_dirty_nodelay(fs_info); - return ret; } @@ -3254,6 +3251,8 @@ void btrfs_add_delayed_iput(struct inode *inode) ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); + if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fab9443f6a42..9c8e1734429c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3221,6 +3221,26 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) inode_lock_nested(inode2, I_MUTEX_CHILD); } +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } else if (inode1 == inode2 && loff2 < loff1) { + swap(loff1, loff2); + } + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { @@ -3242,11 +3262,12 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, return -EINVAL; /* - * Lock destination range to serialize with concurrent readpages(). + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. */ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); return ret; } @@ -3905,17 +3926,33 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, len = ALIGN(src->i_size, bs) - off; if (destoff > inode->i_size) { + const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); + ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) return ret; + /* + * We may have truncated the last block if the inode's size is + * not sector size aligned, so we need to wait for writeback to + * complete before proceeding further, otherwise we can race + * with cloning and attempt to increment a reference to an + * extent that no longer exists (writeback completed right after + * we found the previous extent covering eof and before we + * attempted to increment its reference count). + */ + ret = btrfs_wait_ordered_range(inode, wb_start, + destoff - wb_start); + if (ret) + return ret; } /* - * Lock destination range to serialize with concurrent readpages(). + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. */ - lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); + btrfs_double_extent_lock(src, off, inode, destoff, len); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); + btrfs_double_extent_unlock(src, off, inode, destoff, len); /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1b15b43905f8..7ea2d6b1f170 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6646,7 +6646,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - if (!access_ok(VERIFY_READ, arg->clone_sources, + if (!access_ok(arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { ret = -EFAULT; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 368a5b9e6c13..c5586ffd1426 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1458,56 +1458,6 @@ out: return root; } -static int parse_security_options(char *orig_opts, - struct security_mnt_opts *sec_opts) -{ - char *secdata = NULL; - int ret = 0; - - secdata = alloc_secdata(); - if (!secdata) - return -ENOMEM; - ret = security_sb_copy_data(orig_opts, secdata); - if (ret) { - free_secdata(secdata); - return ret; - } - ret = security_sb_parse_opts_str(secdata, sec_opts); - free_secdata(secdata); - return ret; -} - -static int setup_security_options(struct btrfs_fs_info *fs_info, - struct super_block *sb, - struct security_mnt_opts *sec_opts) -{ - int ret = 0; - - /* - * Call security_sb_set_mnt_opts() to check whether new sec_opts - * is valid. - */ - ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); - if (ret) - return ret; - -#ifdef CONFIG_SECURITY - if (!fs_info->security_opts.num_mnt_opts) { - /* first time security setup, copy sec_opts to fs_info */ - memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); - } else { - /* - * Since SELinux (the only one supporting security_mnt_opts) - * does NOT support changing context during remount/mount of - * the same sb, this must be the same or part of the same - * security options, just free it. - */ - security_free_mnt_opts(sec_opts); - } -#endif - return ret; -} - /* * Find a superblock for the given device / mount point. * @@ -1522,16 +1472,15 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, struct btrfs_device *device = NULL; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; - struct security_mnt_opts new_sec_opts; + void *new_sec_opts = NULL; fmode_t mode = FMODE_READ; int error = 0; if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; - security_init_mnt_opts(&new_sec_opts); if (data) { - error = parse_security_options(data, &new_sec_opts); + error = security_sb_eat_lsm_opts(data, &new_sec_opts); if (error) return ERR_PTR(error); } @@ -1550,7 +1499,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - security_init_mnt_opts(&fs_info->security_opts); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; goto error_fs_info; @@ -1601,16 +1549,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data); } + if (!error) + error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL); + security_free_mnt_opts(&new_sec_opts); if (error) { deactivate_locked_super(s); - goto error_sec_opts; - } - - fs_info = btrfs_sb(s); - error = setup_security_options(fs_info, s, &new_sec_opts); - if (error) { - deactivate_locked_super(s); - goto error_sec_opts; + return ERR_PTR(error); } return dget(s->s_root); @@ -1779,18 +1723,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_remount_prepare(fs_info); if (data) { - struct security_mnt_opts new_sec_opts; + void *new_sec_opts = NULL; - security_init_mnt_opts(&new_sec_opts); - ret = parse_security_options(data, &new_sec_opts); + ret = security_sb_eat_lsm_opts(data, &new_sec_opts); + if (!ret) + ret = security_sb_remount(sb, new_sec_opts); + security_free_mnt_opts(&new_sec_opts); if (ret) goto restore; - ret = setup_security_options(fs_info, sb, - &new_sec_opts); - if (ret) { - security_free_mnt_opts(&new_sec_opts); - goto restore; - } } ret = btrfs_parse_options(fs_info, data, *flags); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2576b1a379c9..3e4f8f88353e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7825,6 +7825,18 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + /* It's possible this device is a dummy for seed device */ + if (dev->disk_total_bytes == 0) { + dev = find_device(fs_info->fs_devices->seed, devid, NULL); + if (!dev) { + btrfs_err(fs_info, "failed to find seed devid %llu", + devid); + ret = -EUCLEAN; + goto out; + } + } + if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", |