From 57ba4cb85bffc0c7c6567c89d23713721fea9655 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 20 May 2016 04:34:23 +0100 Subject: Btrfs: fix race between device replace and block group removal When it's finishing, the device replace code iterates all extent maps representing block group and for each one that has a stripe that refers to the source device, it replaces its device with the target device. However when it replaces the source device with the target device it, the target device still has an ID of 0ULL (BTRFS_DEV_REPLACE_DEVID), only after its ID is changed to match the one from the source device. This leads to races with the chunk removal code that can temporarly see a device with an ID of 0ULL and then attempt to use that ID to remove items from the device tree and fail, causing a transaction abort: [ 9238.594364] BTRFS info (device sdf): dev_replace from /dev/sdf (devid 3) to /dev/sde finished [ 9238.594377] ------------[ cut here ]------------ [ 9238.594402] WARNING: CPU: 14 PID: 21566 at fs/btrfs/volumes.c:2771 btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594403] BTRFS: Transaction aborted (error 1) [ 9238.594416] Modules linked in: btrfs crc32c_generic acpi_cpufreq xor tpm_tis tpm raid6_pq ppdev parport_pc processor psmouse parport i2c_piix4 evdev sg i2c_core se rio_raw pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix virtio_pci libata virtio_ring virtio e1000 scsi_mod fl oppy [last unloaded: btrfs] [ 9238.594418] CPU: 14 PID: 21566 Comm: btrfs-cleaner Not tainted 4.6.0-rc7-btrfs-next-29+ #1 [ 9238.594419] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 9238.594421] 0000000000000000 ffff88017f1dbc60 ffffffff8126b42c ffff88017f1dbcb0 [ 9238.594422] 0000000000000000 ffff88017f1dbca0 ffffffff81052b14 00000ad37f1dbd18 [ 9238.594423] 0000000000000001 ffff88018068a558 ffff88005c4b9c00 ffff880233f60db0 [ 9238.594424] Call Trace: [ 9238.594428] [] dump_stack+0x67/0x90 [ 9238.594430] [] __warn+0xc2/0xdd [ 9238.594432] [] warn_slowpath_fmt+0x4b/0x53 [ 9238.594434] [] ? kmem_cache_free+0x128/0x188 [ 9238.594450] [] btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594452] [] ? arch_local_irq_save+0x9/0xc [ 9238.594464] [] btrfs_delete_unused_bgs+0x317/0x382 [btrfs] [ 9238.594476] [] cleaner_kthread+0x1ad/0x1c7 [btrfs] [ 9238.594489] [] ? btree_invalidatepage+0x8e/0x8e [btrfs] [ 9238.594490] [] kthread+0xd4/0xdc [ 9238.594494] [] ret_from_fork+0x22/0x40 [ 9238.594495] [] ? kthread_stop+0x286/0x286 [ 9238.594496] ---[ end trace 183efbe50275f059 ]--- The sequence of steps leading to this is like the following: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_delete_unused_bgs() btrfs_remove_chunk() looks up for the extent map corresponding to the chunk lock_chunks() (chunk_mutex) check_system_chunk() unlock_chunks() (chunk_mutex) locks fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) iterates over all stripes from the extent map --> calls btrfs_free_dev_extent() passing it the target device that still has an ID of 0ULL --> btrfs_free_dev_extent() fails --> aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid (which is necessarily > 0) frees the srcdev unlocks fs_info->chunk_mutex So fix this by taking the device list mutex while processing the stripes for the chunk's extent map. This is similar to the race between device replace and block group creation that was fixed by commit 50460e37186a ("Btrfs: fix race when finishing dev replace leading to transaction abort"). Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/volumes.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c01824eef08..04ca48362ef1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2761,6 +2761,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; /* Just in case */ root = root->fs_info->chunk_root; @@ -2787,12 +2788,19 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } @@ -2811,11 +2819,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } } } + mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); -- cgit v1.2.1 From 22ab04e814f4fe2ce72a13d291491f98ef6ac757 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 May 2016 20:29:44 +0100 Subject: Btrfs: fix race between device replace and chunk allocation While iterating and copying extents from the source device, the device replace code keeps adjusting a left cursor that is used to make sure that once we finish processing a device extent, any future writes to extents from the corresponding block group will get into both the source and target devices. This left cursor is also used for resuming the device replace operation at mount time. However using this left cursor to decide whether writes go into both devices or only the source device is not enough to guarantee we don't miss copying extents into the target device. There are two cases where the current approach fails. The first one is related to when there are holes in the device and they get allocated for new block groups while the device replace operation is iterating the device extents (more on this explained below). The second one is that when that loop over the device extents finishes, we start dellaloc, wait for all ordered extents and then commit the current transaction, we might have got new block groups allocated that are now using a device extent that has an offset greater then or equals to the value of the left cursor, in which case writes to extents belonging to these new block groups will get issued only to the source device. For the first case where the current approach of using a left cursor fails, consider the source device currently has the following layout: [ extent bg A ] [ hole, unallocated space ] [extent bg B ] 3Gb 4Gb 5Gb While we are iterating the device extents from the source device using the commit root of the device tree, the following happens: CPU 1 CPU 2 scrub_enumerate_chunks() --> searches the device tree for extents belonging to the source device using the device tree's commit root --> 1st iteration finds extent belonging to block group A --> sets block group A to RO mode (btrfs_inc_block_group_ro) --> sets cursor left to found_key.offset which is 3Gb --> scrub_chunk() starts copies all allocated extents from block group's A stripe at source device into target device btrfs_alloc_chunk() --> allocates device extent in the range [4Gb, 5Gb[ from the source device for a new block group C extent allocated from block group C for a direct IO, buffered write or btree node/leaf extent is written to, perhaps in response to a writepages() call from the VM or directly through direct IO the write is made only against the source device and not against the target device because the extent's offset is in the interval [4Gb, 5Gb[ which is larger then the value of cursor_left (3Gb) --> scrub_chunks() finishes --> updates left cursor from 3Gb to 4Gb --> btrfs_dec_block_group_ro() sets block group A back to RW mode --> 2nd iteration finds extent belonging to block group B - it did not find the new extent in the range [4Gb, 5Gb[ for block group C because we are using the device tree's commit root or even because the block group's items are not all yet inserted in the respective btrees, that is, the block group is still attached to some transaction handle's new_bgs list and btrfs_create_pending_block_groups() was not called yet against that transaction handle, so the device extent items were not yet inserted into the devices tree --> so we end not copying anything from the newly allocated device extent from the source device to the target device So fix this by making __btrfs_map_block() always redirect writes to the target device as well, independently of the left cursor's value. With this change the left cursor is now used only for the purpose of tracking progress and allow a mount operation to resume a device replace. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/volumes.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 04ca48362ef1..765aabd9145f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5773,20 +5773,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - if (physical_of_found + map->stripe_len <= - dev_replace->cursor_left) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; - tgtdev_indexes++; - num_stripes++; - } + tgtdev_indexes++; + num_stripes++; } } -- cgit v1.2.1 From 65d4f4c151a5fa7b2dacaaf70def3f95001766d7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 Sep 2015 15:00:37 -0400 Subject: Btrfs: end transaction if we abort when creating uuid root We still need to call btrfs_end_transaction if we call btrfs_abort_transaction, otherwise we hang and make me super grumpy. Thanks, Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c01824eef08..673c72ab4fbe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4230,6 +4230,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); return ret; } -- cgit v1.2.1 From d865177a5e749827f248f6363f5100d3a2f66b0f Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 3 Jun 2016 17:41:42 -0700 Subject: Btrfs: clear uptodate flags of pages in sys_array eb We set uptodate flag to pages in the temporary sys_array eb, but do not clear the flag after free eb. As the special btree inode may still hold a reference on those pages, the uptodate flag can remain alive in them. If btrfs_super_chunk_root has been intentionally changed to the offset of this sys_array eb, reading chunk_root will read content of sys_array and it will skip our beautiful checks in btree_readpage_end_io_hook() because of "pages of eb are uptodate => eb is uptodate" This adds the 'clear uptodate' part to force it to read from disk. Reviewed-by: Josef Bacik Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 673c72ab4fbe..42ccde43053b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6631,12 +6631,14 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_array_offset += len; cur_offset += len; } + clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; out_short_read: printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", len, cur_offset); + clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return -EIO; } -- cgit v1.2.1 From 99e3ecfcb9f4ca35192d20a5bea158b81f600062 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 3 Jun 2016 12:05:14 -0700 Subject: Btrfs: add more validation checks for superblock This adds validation checks for super_total_bytes, super_bytes_used and super_stripesize, super_num_devices. Reported-by: Vegard Nossum Reported-by: Quentin Casasnovas Reviewed-by: David Sterba Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 42ccde43053b..fd5c9e69894a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6651,6 +6651,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) struct btrfs_key found_key; int ret; int slot; + u64 total_dev = 0; root = root->fs_info->chunk_root; @@ -6692,6 +6693,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) ret = read_one_dev(root, leaf, dev_item); if (ret) goto error; + total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -6701,6 +6703,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } path->slots[0]++; } + + /* + * After loading chunk tree, we've got all device information, + * do another round of validation checks. + */ + if (total_dev != root->fs_info->fs_devices->total_devices) { + btrfs_err(root->fs_info, + "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_super_num_devices(root->fs_info->super_copy), + total_dev); + ret = -EINVAL; + goto error; + } + if (btrfs_super_total_bytes(root->fs_info->super_copy) < + root->fs_info->fs_devices->total_rw_bytes) { + btrfs_err(root->fs_info, + "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", + btrfs_super_total_bytes(root->fs_info->super_copy), + root->fs_info->fs_devices->total_rw_bytes); + ret = -EINVAL; + goto error; + } ret = 0; error: unlock_chunks(root); -- cgit v1.2.1 From e06cd3dd7cea50e87663a88acdfdb7ac1c53a5ca Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 3 Jun 2016 12:05:15 -0700 Subject: Btrfs: add validadtion checks for chunk loading To prevent fuzzed filesystem images from panic the whole system, we need various validation checks to refuse to mount such an image if btrfs finds any invalid value during loading chunks, including both sys_array and regular chunks. Note that these checks may not be sufficient to cover all corner cases, feel free to add more checks. Reported-by: Vegard Nossum Reported-by: Quentin Casasnovas Reviewed-by: David Sterba Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 82 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fd5c9e69894a..74507b05061b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6251,27 +6251,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk) +/* Return -EIO if any error, otherwise return 0. */ +static int btrfs_check_chunk_valid(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - u64 logical; u64 length; u64 stripe_len; - u64 devid; - u8 uuid[BTRFS_UUID_SIZE]; - int num_stripes; - int ret; - int i; + u16 num_stripes; + u16 sub_stripes; + u64 type; - logical = key->offset; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - /* Validation check */ + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + if (!num_stripes) { btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", num_stripes); @@ -6282,6 +6278,11 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk logical %llu", logical); return -EIO; } + if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) { + btrfs_err(root->fs_info, "invalid chunk sectorsize %u", + btrfs_chunk_sector_size(leaf, chunk)); + return -EIO; + } if (!length || !IS_ALIGNED(length, root->sectorsize)) { btrfs_err(root->fs_info, "invalid chunk length %llu", length); @@ -6293,13 +6294,54 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return -EIO; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)) { + type) { btrfs_err(root->fs_info, "unrecognized chunk type: %llu", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); return -EIO; } + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && + num_stripes != 1)) { + btrfs_err(root->fs_info, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EIO; + } + + return 0; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 stripe_len; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + ret = btrfs_check_chunk_valid(root, leaf, chunk, logical); + if (ret) + return ret; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6547,6 +6589,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 array_size; u32 len = 0; u32 cur_offset; + u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6613,6 +6656,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) break; } + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(root->fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; -- cgit v1.2.1