From 7a5c3c9be1059feed0e470c6dc0994dcaed4f12c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 17 Jun 2014 18:58:59 +0800 Subject: Btrfs: fix put dio bio twice when we submit dio bio fail The caller of btrfs_submit_direct_hook() will put the original dio bio when btrfs_submit_direct_hook() return a error number, so we needn't put the original bio in btrfs_submit_direct_hook(). Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73098328d040..33c05188cbf0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7306,10 +7306,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); - if (ret) { - bio_put(orig_bio); + if (ret) return -EIO; - } if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; @@ -7326,6 +7324,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; + bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; atomic_inc(&dip->pending_bios); -- cgit v1.2.1 From 1707e26d6ab05c477a91d260e31fda7c6c38588e Mon Sep 17 00:00:00 2001 From: chandan Date: Tue, 1 Jul 2014 12:04:28 +0530 Subject: Btrfs: fill_holes: Fix slot number passed to hole_mergeable() call. For a non-existent key, btrfs_search_slot() sets path->slots[0] to the slot where the key could have been present, which in this case would be the slot containing the extent item which would be the next neighbor of the file range being punched. The current code passes an incremented path->slots[0] and we skip to the wrong file extent item. This would mean that we would fail to merge the "yet to be created" hole with the next neighboring hole (if one exists). Fix this. Signed-off-by: Chandan Rajendra Reviewed-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d3afac292d67..77e33534c7d6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2088,10 +2088,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, goto out; } - if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { u64 num_bytes; - path->slots[0]++; key.offset = offset; btrfs_set_item_key_safe(root, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], -- cgit v1.2.1 From b96de000bc8bc9688b3a2abea4332bd57648a49f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 3 Jul 2014 18:22:05 +0800 Subject: Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cb82f62cb7c..7c538f65214b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -508,6 +508,33 @@ static noinline int device_list_add(const char *path, ret = 1; device->fs_devices = fs_devices; } else if (!device->name || strcmp(device->name->str, path)) { + /* + * When FS is already mounted. + * 1. If you are here and if the device->name is NULL that + * means this device was missing at time of FS mount. + * 2. If you are here and if the device->name is different + * from 'path' that means either + * a. The same device disappeared and reappeared with + * different name. or + * b. The missing-disk-which-was-replaced, has + * reappeared now. + * + * We must allow 1 and 2a above. But 2b would be a spurious + * and unintentional. + * + * Further in case of 1 and 2a above, the disk at 'path' + * would have missed some transaction when it was away and + * in case of 2a the stale bdev has to be updated as well. + * 2b must not be allowed at all time. + */ + + /* + * As of now don't allow update to btrfs_fs_device through + * the btrfs dev scan cli, after FS has been mounted. + */ + if (fs_devices->opened) + return -EBUSY; + name = rcu_string_strdup(path, GFP_NOFS); if (!name) return -ENOMEM; -- cgit v1.2.1 From 77bdae4d136e167bab028cbec58b988f91cf73c0 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 3 Jul 2014 18:22:06 +0800 Subject: btrfs: check generation as replace duplicates devid+uuid When FS in unmounted we need to check generation number as well since devid+uuid combination could match with the missing replaced disk when it reappears, and without this patch it might pair with the replaced disk again. device_list_add() function is called in the following threads, mount device option mount argument ioctl BTRFS_IOC_SCAN_DEV (btrfs dev scan) ioctl BTRFS_IOC_DEVICES_READY (btrfs dev ready ) they have been unit tested to work fine with this patch. If the user knows what he is doing and really want to pair with replaced disk (which is not a standard operation), then he should first clear the kernel btrfs device list in the memory by doing the module unload/load and followed with the mount -o device option. Signed-off-by: Anand Jain Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7c538f65214b..5700ab03e84b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -532,8 +532,19 @@ static noinline int device_list_add(const char *path, * As of now don't allow update to btrfs_fs_device through * the btrfs dev scan cli, after FS has been mounted. */ - if (fs_devices->opened) + if (fs_devices->opened) { return -EBUSY; + } else { + /* + * That is if the FS is _not_ mounted and if you + * are here, that means there is more than one + * disk with same uuid and devid.We keep the one + * with larger generation number or the last-in if + * generation are equal. + */ + if (found_transid < device->generation) + return -EEXIST; + } name = rcu_string_strdup(path, GFP_NOFS); if (!name) @@ -546,6 +557,15 @@ static noinline int device_list_add(const char *path, } } + /* + * Unmount does not free the btrfs_device struct but would zero + * generation along with most of the other members. So just update + * it back. We need it to pick the disk with largest generation + * (as above). + */ + if (!fs_devices->opened) + device->generation = found_transid; + if (found_transid > fs_devices->latest_trans) { fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; -- cgit v1.2.1 From 69611ac810af8e88eeb5ebd851589a0e8cc90860 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 3 Jul 2014 18:22:12 +0800 Subject: Btrfs: fix unzeroed members in fs_devices when creating a fs from seed fs We forgot to zero some members in fs_devices when we create new fs_devices from the one of the seed fs. It would cause the problem that we got wrong chunk profile when allocating chunks. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5700ab03e84b..50edbbc2e76c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1988,6 +1988,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; + fs_devices->missing_devices = 0; + fs_devices->num_can_discard = 0; + fs_devices->rotating = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); -- cgit v1.2.1 From 3a7d55c84c76a3088ddfb798c187182143683a16 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 16 Jul 2014 18:38:01 +0800 Subject: Btrfs: fix wrong missing device counter decrease The missing devices are accounted by its own fs device, for example the missing devices in seed filesystem will be accounted by the fs device of the seed filesystem, not by the new filesystem which is based on the seed filesystem, so when we remove the missing device in the seed filesystem, we should decrease the counter of its own fs device. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 50edbbc2e76c..da0e632a21fc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1718,7 +1718,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->total_devices--; if (device->missing) - root->fs_info->fs_devices->missing_devices--; + device->fs_devices->missing_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); -- cgit v1.2.1 From 9a025a0860ccc0f02af153c966bc1f83e5d9fc62 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:13 +0800 Subject: Btrfs: fix wrong write range for filemap_fdatawrite_range() filemap_fdatawrite_range() expect the third arg to be @end not @len, fix it. Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 33c05188cbf0..73fadc7ead0e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7533,7 +7533,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, count = iov_iter_count(iter); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, count); + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); if (rw & WRITE) { /* -- cgit v1.2.1 From e2eca69dc6c09d968d69312b9899968a9b03a4a9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:14 +0800 Subject: Btrfs: fix wrong extent mapping for DirectIO btrfs_next_leaf() will use current leaf's last key to search and then return a bigger one. So it may still return a file extent item that is smaller than expected value and we will get an overflow here for @em->len. This is easy to reproduce for Btrfs Direct writting, it did not cause any problem, because writting will re-insert right mapping later. However, by hacking code to make DIO support compression, wrong extent mapping is kept and it encounter merging failure(EEXIST) quickly. Fix this problem by looping to find next file extent item that is bigger than @start or we could not find anything more. Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73fadc7ead0e..a3c6e76f5a4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6275,6 +6275,8 @@ next: goto not_found; if (start + len <= found_key.offset) goto not_found; + if (start > found_key.offset) + goto next; em->start = start; em->orig_start = start; em->len = found_key.offset - start; -- cgit v1.2.1 From 2c91943b5066314a8bb9f0a65584e5e4cd92ea63 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 18 Jul 2014 09:55:43 +0800 Subject: btrfs: Return right extent when fiemap gives unaligned offset and len. When page aligned start and len passed to extent_fiemap(), the result is good, but when start and len is not aligned, e.g. start = 1 and len = 4095 is passed to extent_fiemap(), it returns no extent. The problem is that start and len is all rounded down which causes the problem. This patch will round down start and round up (start + len) to return right extent. Reported-by: Chandan Rajendra Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a389820d158b..1c70cff4a9e1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4213,8 +4213,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; - start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); - len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + start = round_down(start, BTRFS_I(inode)->root->sectorsize); + len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start; /* * lookup the last file extent. We're not using i_size here -- cgit v1.2.1 From ff61d17c6324d1b483fbbc5144f09668c24ff60c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:06 +0800 Subject: Btrfs: Fix the problem that the replace destroys the seed filesystem The seed filesystem was destroyed by the device replace, the reproduce method is: # mkfs.btrfs -f # btrfstune -S 1 # mount # btrfs device add # umount # mount # btrfs replace start -f # umount # mount It is because we erase the super block on the seed device. It is wrong, we should not change anything on the seed device. Signed-off-by: Miao Xie Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da0e632a21fc..00c8efdcd1e5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1848,8 +1848,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->bdev) { fs_info->fs_devices->open_devices--; - /* zero out the old super */ - btrfs_scratch_superblock(srcdev); + /* + * zero out the old super if it is not writable + * (e.g. seed device) + */ + if (srcdev->writeable) + btrfs_scratch_superblock(srcdev); } call_rcu(&srcdev->rcu, free_device); -- cgit v1.2.1 From 5d68da3b8ee6eb2257aa4b8d885581782278ae93 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:07 +0800 Subject: Btrfs: don't write any data into a readonly device when scrub We should not write data into a readonly device especially seed device when doing scrub, skip those devices. Signed-off-by: Miao Xie Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b6d198f5181e..23d3f6e6a482 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2904,6 +2904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; + struct rcu_string *name; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -2965,6 +2966,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -ENODEV; } + if (!is_dev_replace && !readonly && !dev->writeable) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + rcu_read_lock(); + name = rcu_dereference(dev->name); + btrfs_err(fs_info, "scrub: device %s is not writable", + name->str); + rcu_read_unlock(); + return -EROFS; + } + mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); -- cgit v1.2.1 From 7df69d3e94d6de537fd1afb574c760d8dc83ab60 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:13 +0800 Subject: Btrfs: Fix wrong device size when we are resizing the device total_bytes of device is just a in-memory variant which is used to record the size of the device, and it might be changed before we resize a device, if the resize operation fails, it will be fallbacked. But some code used it to update on-disk metadata of the device, it would cause the problem that on-disk metadata of the devices was not consistent. We should use the other variant named disk_total_bytes to update the on-disk metadata of device, because that variant is updated only when the resize operation is successful. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/volumes.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d0ed9e664f7d..c99a414813c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3450,7 +3450,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_generation(dev_item, 0); btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); - btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_total_bytes(dev_item, + dev->disk_total_bytes); btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 00c8efdcd1e5..9d4ce53d7569 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1483,7 +1483,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_set_device_group(leaf, dev_item, 0); btrfs_set_device_seek_speed(leaf, dev_item, 0); -- cgit v1.2.1 From 95669976bd7d30ae265db938ecb46a6b7f8cb893 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:14 +0800 Subject: Btrfs: don't consider the missing device when allocating new chunks The original code allocated new chunks by the number of the writable devices and missing devices to make sure that any RAID levels on a degraded FS continue to be honored, but it introduced a problem that it stopped us to allocating new chunks, the steps to reproduce is following: # mkfs.btrfs -m raid1 -d raid1 -f # mkfs.btrfs -f //Removing from the original fs # mount -o degraded # dd if=/dev/null of=/tmpfile bs=1M It is because we allocate new chunks only on the writable devices, if we take the number of missing devices into account, and want to allocate new chunks with higher RAID level, we will fail becaue we don't have enough writable device. Fix it by ignoring the number of missing devices when allocating new chunks. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 102ed3143976..5524434da059 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3586,13 +3586,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) */ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - u64 num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; u64 tmp; @@ -8440,13 +8434,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (stripped) return extended_to_chunk(stripped); - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + num_devices = root->fs_info->fs_devices->rw_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | -- cgit v1.2.1 From 87fa3bb0786f37dff0b92f2c38421dd56d8902a9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Jul 2014 19:09:39 +0800 Subject: Btrfs: fix regression of btrfs device replace Commit 49c6f736f34f901117c20960ebd7d5e60f12fcac( btrfs: dev replace should replace the sysfs entry) added the missing sysfs entry in the process of device replace, but didn't take missing devices into account, so now we have BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 IP: [] btrfs_kobj_rm_device+0x21/0x40 [btrfs] ... To reproduce it, 1. mkfs.btrfs -f disk1 disk2 2. mkfs.ext4 disk1 3. mount disk2 /mnt -odegraded 4. btrfs replace start -B 1 disk3 /mnt -------------------------- This fixes the problem. Reported-by: Chris Murphy Signed-off-by: Liu Bo Reviewed-by: Satoru Takeuchi Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 78699364f537..12e53556e214 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -614,7 +614,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, if (!fs_info->device_dir_kobj) return -EINVAL; - if (one_device) { + if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; -- cgit v1.2.1 From 9c3b306e1c9e6be4be09e99a8fe2227d1005effc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 31 Jul 2014 14:41:07 +0100 Subject: Btrfs: race free update of commit root for ro snapshots This is a better solution for the problem addressed in the following commit: Btrfs: update commit root on snapshot creation after orphan cleanup (3821f348889e506efbd268cc8149e0ebfa47c4e5) The previous solution wasn't the best because of 2 reasons: 1) It added another full transaction commit, which is more expensive than just swapping the commit root with the root; 2) If a reboot happened after the first transaction commit (the one that creates the snapshot) and before the second transaction commit, then we would end up with the same problem if a send using that snapshot was requested before the first transaction commit after the reboot. This change addresses those 2 issues. The second issue is addressed by switching the commit root in the dentry lookup VFS callback, which is also called by the snapshot/subvol creation ioctl and performs orphan cleanup if needed. Like the vfs, the ioctl locks the parent inode too, preventing race issues between a dentry lookup and snapshot creation. Cc: Alex Lyakas Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 ++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 33 --------------------------------- 2 files changed, 36 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3c6e76f5a4e..6dd6e50d143a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5181,6 +5181,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } + /* + * If orphan cleanup did remove any orphans, it means the tree + * was modified and therefore the commit root is not the same as + * the current root anymore. This is a problem, because send + * uses the commit root and therefore can see inode items that + * don't exist in the current root anymore, and for example make + * calls to btrfs_iget, which will do tree lookups based on the + * current root and not on the commit root. Those lookups will + * fail, returning a -ESTALE error, and making send fail with + * that error. So make sure a send does not see any orphans we + * have just removed, and that it will see the same inodes + * regardless of whether a transaction commit happened before + * it started (meaning that the commit root will be the same as + * the current root) or not. + */ + if (sub_root->node != sub_root->commit_root) { + u64 sub_flags = btrfs_root_flags(&sub_root->root_item); + + if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) { + struct extent_buffer *eb; + + /* + * Assert we can't have races between dentry + * lookup called through the snapshot creation + * ioctl and the VFS. + */ + ASSERT(mutex_is_locked(&dir->i_mutex)); + + down_write(&root->fs_info->commit_root_sem); + eb = sub_root->commit_root; + sub_root->commit_root = + btrfs_root_node(sub_root); + up_write(&root->fs_info->commit_root_sem); + free_extent_buffer(eb); + } + } } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 47aceb494d1d..845287ca59c3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - ret = btrfs_orphan_cleanup(pending_snapshot->snap); - if (ret) - goto fail; - - /* - * If orphan cleanup did remove any orphans, it means the tree was - * modified and therefore the commit root is not the same as the - * current root anymore. This is a problem, because send uses the - * commit root and therefore can see inode items that don't exist - * in the current root anymore, and for example make calls to - * btrfs_iget, which will do tree lookups based on the current root - * and not on the commit root. Those lookups will fail, returning a - * -ESTALE error, and making send fail with that error. So make sure - * a send does not see any orphans we have just removed, and that it - * will see the same inodes regardless of whether a transaction - * commit happened before it started (meaning that the commit root - * will be the same as the current root) or not. - */ - if (readonly && pending_snapshot->snap->node != - pending_snapshot->snap->commit_root) { - trans = btrfs_join_transaction(pending_snapshot->snap); - if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot->snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); -- cgit v1.2.1 From 5762b5c958abbecb7fb9f4596a6476d1ce91ecf6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Aug 2014 00:10:32 +0100 Subject: Btrfs: ensure tmpfile inode is always persisted with link count of 0 If we open a file with O_TMPFILE, don't do any further operation on it (so that the inode item isn't updated) and then force a transaction commit, we get a persisted inode item with a link count of 1, and not 0 as it should be. Steps to reproduce it (requires a modern xfs_io with -T support): $ mkfs.btrfs -f /dev/sdd $ mount -o /dev/sdd /mnt $ xfs_io -T /mnt & $ sync Then btrfs-debug-tree shows the inode item with a link count of 1: $ btrfs-debug-tree /dev/sdd (...) fs tree key (FS_TREE ROOT_ITEM 0) leaf 29556736 items 4 free space 15851 generation 6 owner 5 fs uuid f164d01b-1b92-481d-a4e4-435fb0f843d0 chunk uuid 0e3d0e56-bcca-4a1c-aa5f-cec2c6f4f7a6 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 inode generation 3 transid 6 size 0 block group 0 mode 40755 links 1 item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 inode ref index 0 namelen 2 name: .. item 2 key (257 INODE_ITEM 0) itemoff 15951 itemsize 160 inode generation 6 transid 6 size 0 block group 0 mode 100600 links 1 item 3 key (ORPHAN ORPHAN_ITEM 257) itemoff 15951 itemsize 0 orphan item checksum tree key (CSUM_TREE ROOT_ITEM 0) (...) Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dd6e50d143a..57c3129ee2a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5641,6 +5641,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); } + /* + * O_TMPFILE, set link count to 0, so that after this point, + * we fill in an inode item with the correct link count. + */ + if (!name) + set_nlink(inode, 0); + /* * we have to initialize this early, so we can reclaim the inode * number if we fail afterwards in this function. @@ -9007,6 +9014,14 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (ret) goto out; + /* + * We set number of links to 0 in btrfs_new_inode(), and here we set + * it to 1 because d_tmpfile() will issue a warning if the count is 0, + * through: + * + * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); mark_inode_dirty(inode); -- cgit v1.2.1 From 74121f7cbb2f60b41c48184ad5b889c0704e7b90 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 7 Aug 2014 12:00:44 +0100 Subject: Btrfs: fix hole detection during file fsync The file hole detection logic during a file fsync wasn't correct, because it didn't look back (in a previous leaf) for the last file extent item that can be in a leaf to the left of our leaf and that has a generation lower than the current transaction id. This made it assume that a hole exists when it really doesn't exist in the file. Such false positive hole detection happens in the following scenario: * We have a file that has many file extent items, covering 3 or more btree leafs (the first leaf must contain non file extent items too). * Two ranges of the file are modified, with their extent items being located at 2 different leafs and those leafs aren't consecutive. * When processing the second modified leaf, we weren't checking if some file extent item exists that is located in some leaf that is between our 2 modified leafs, and therefore assumed the range defined between the last file extent item in the first leaf and the first file extent item in the second leaf matched a hole. Fortunately this didn't result in overriding the log with wrong data, instead it made the last loop in copy_items() attempt to insert a duplicated key (for a hole file extent item), which makes the file fsync code return with -EEXIST to file.c:btrfs_sync_file() which in turn ends up doing a full transaction commit, which is much more expensive then writing only to the log tree and wait for it to be durably persisted (as well as the file's modified extents/pages). Therefore fix the hole detection logic, so that we don't pay the cost of doing full transaction commits. I could trigger this issue with the following test for xfstests (which never fails, either without or with this patch). The last fsync call results in a full transaction commit, due to the -EEXIST error mentioned above. I could also observe this behaviour happening frequently when running xfstests/generic/075 in a loop. Test: _cleanup() { _cleanup_flakey rm -fr $tmp } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_flakey _need_to_be_root rm -f $seqres.full # Create a file with many file extent items, each representing a 4Kb extent. # These items span 3 btree leaves, of 16Kb each (default mkfs.btrfs leaf size # as of btrfs-progs 3.12). _scratch_mkfs -l 16384 >/dev/null 2>&1 _init_flakey SAVE_MOUNT_OPTIONS="$MOUNT_OPTIONS" MOUNT_OPTIONS="$MOUNT_OPTIONS -o commit=999" _mount_flakey # First fsync, inode has BTRFS_INODE_NEEDS_FULL_SYNC flag set. $XFS_IO_PROG -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # For any of the following fsync calls, inode doesn't have the flag # BTRFS_INODE_NEEDS_FULL_SYNC set. for ((i = 1; i <= 500; i++)); do OFFSET=$((4096 * i)) LEN=4096 $XFS_IO_PROG -c "pwrite -S 0x01 $OFFSET $LEN" -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io done # Commit transaction and bump next transaction's id (to 7). sync # Truncate will set the BTRFS_INODE_NEEDS_FULL_SYNC flag in the btrfs's # inode runtime flags. $XFS_IO_PROG -c "truncate 2048000" $SCRATCH_MNT/foo # Commit transaction and bump next transaction's id (to 8). sync # Touch 1 extent item from the first leaf and 1 from the last leaf. The leaf # in the middle, containing only file extent items, isn't touched. So the # next fsync, when calling btrfs_search_forward(), won't visit that middle # leaf. First and 3rd leaf have now a generation with value 8, while the # middle leaf remains with a generation with value 6. $XFS_IO_PROG \ -c "pwrite -S 0xee -b 4096 0 4096" \ -c "pwrite -S 0xff -b 4096 2043904 4096" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io _load_flakey_table $FLAKEY_DROP_WRITES md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES # During mount, we'll replay the log created by the fsync above, and the file's # md5 digest should be the same we got before the unmount. _mount_flakey md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey MOUNT_OPTIONS="$SAVE_MOUNT_OPTIONS" status=0 exit Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9e1f2cd5e67a..7e0e6e3029dd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3298,7 +3298,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; bool has_extents = false; - bool need_find_last_extent = (*last_extent == 0); + bool need_find_last_extent = true; bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3352,8 +3352,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, */ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { has_extents = true; - if (need_find_last_extent && - first_key.objectid == (u64)-1) + if (first_key.objectid == (u64)-1) first_key = ins_keys[i]; } else { need_find_last_extent = false; @@ -3427,6 +3426,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!has_extents) return ret; + if (need_find_last_extent && *last_extent == first_key.offset) { + /* + * We don't have any leafs between our current one and the one + * we processed before that can have file extent items for our + * inode (and have a generation number smaller than our current + * transaction id). + */ + need_find_last_extent = false; + } + /* * Because we use btrfs_search_forward we could skip leaves that were * not modified and then assume *last_extent is valid when it really @@ -3537,7 +3546,7 @@ fill_holes: 0, 0); if (ret) break; - *last_extent = offset + len; + *last_extent = extent_end; } /* * Need to let the callers know we dropped the path so they should -- cgit v1.2.1 From 7064dd5c36187725e7ccfd837e07678ae435d3f5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Aug 2014 02:47:05 +0100 Subject: Btrfs: don't monopolize a core when evicting inode If an inode has a very large number of extent maps, we can spend a lot of time freeing them, which triggers a soft lockup warning. Therefore reschedule if we need to when freeing the extent maps while evicting the inode. I could trigger this all the time by running xfstests/generic/299 on a file system with the no-holes feature enabled. That test creates an inode with 11386677 extent maps. $ mkfs.btrfs -f -O no-holes $TEST_DEV $ MKFS_OPTIONS="-O no-holes" ./check generic/299 generic/299 382s ... Message from syslogd@debian-vm3 at Aug 7 10:44:29 ... kernel:[85304.208017] BUG: soft lockup - CPU#0 stuck for 22s! [umount:25330] 384s Ran: generic/299 Passed all 1 tests $ dmesg (...) [86304.300017] BUG: soft lockup - CPU#0 stuck for 23s! [umount:25330] (...) [86304.300036] Call Trace: [86304.300036] [] __slab_free+0x54/0x295 [86304.300036] [] ? free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [] kmem_cache_free+0x282/0x2a0 [86304.300036] [] free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [] btrfs_evict_inode+0xd5/0x660 [btrfs] [86304.300036] [] ? __inode_wait_for_writeback+0x6d/0xc0 [86304.300036] [] ? _raw_spin_unlock+0x2b/0x40 [86304.300036] [] evict+0xab/0x180 [86304.300036] [] dispose_list+0x3e/0x60 [86304.300036] [] evict_inodes+0xf4/0x110 [86304.300036] [] generic_shutdown_super+0x53/0x110 [86304.300036] [] kill_anon_super+0x16/0x30 [86304.300036] [] btrfs_kill_super+0x1a/0xa0 [btrfs] [86304.300036] [] deactivate_locked_super+0x59/0x80 [86304.300036] [] deactivate_super+0x4e/0x70 [86304.300036] [] mntput_no_expire+0x174/0x1f0 [86304.300036] [] ? mntput_no_expire+0x17/0x1f0 [86304.300036] [] SyS_umount+0x97/0x100 (...) Signed-off-by: Filipe Manana Reviewed-by: Satoru Takeuchi Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 57c3129ee2a2..2ac260d41ccd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4674,6 +4674,11 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_bit(EXTENT_FLAG_LOGGING, &em->flags); remove_extent_mapping(map_tree, em); free_extent_map(em); + if (need_resched()) { + write_unlock(&map_tree->lock); + cond_resched(); + write_lock(&map_tree->lock); + } } write_unlock(&map_tree->lock); @@ -4696,6 +4701,7 @@ static void evict_inode_truncate_pages(struct inode *inode) &cached_state, GFP_NOFS); free_extent_state(state); + cond_resched(); spin_lock(&io_tree->lock); } spin_unlock(&io_tree->lock); -- cgit v1.2.1 From 62e2390e1ad78f956e96a6a831761adc6f2bf58a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Aug 2014 02:47:06 +0100 Subject: Btrfs: clone, don't create invalid hole extent map When cloning a file that consists of an inline extent, we were creating an extent map that represents a non-existing trailing hole starting at a file offset that isn't a multiple of the sector size. This happened because when processing an inline extent we weren't aligning the extent's length to the sector size, and therefore incorrectly treating the range [inline_extent_length; sector_size[ as a hole. Signed-off-by: Filipe Manana Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 845287ca59c3..fce6fd0e3f50 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3494,7 +3494,8 @@ process_slot: btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - last_dest_end = new_key.offset + datal; + last_dest_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, destoff, olen); -- cgit v1.2.1 From 51f395ad4058883e4273b02fdebe98072dbdc0d2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Aug 2014 13:06:20 +0800 Subject: btrfs: Use right extent length when inserting overlap extent map. When current btrfs finds that a new extent map is going to be insereted but failed with -EEXIST, it will try again to insert the extent map but with the length of sectorsize. This is OK if we don't enable 'no-holes' feature since all extent space is continuous, we will not go into the not found->insert routine. But if we enable 'no-holes' feature, it will make things out of control. e.g. in 4K sectorsize, we pass the following args to btrfs_get_extent(): btrfs_get_extent() args: start: 27874 len 4100 28672 27874 28672 27874+4100 32768 |-----------------------| |---------hole--------------------|---------data----------| 1) not found and insert Since no extent map containing the range, btrfs_get_extent() will go into the not_found and insert routine, which will try to insert the extent map (27874, 27847 + 4100). 2) first overlap But it overlaps with (28672, 32768) extent, so -EEXIST will be returned by add_extent_mapping(). 3) retry but still overlap After catching the -EEXIST, then btrfs_get_extent() will try insert it again but with 4K length, which still overlaps, so -EEXIST will be returned. This makes the following patch fail to punch hole. d77815461f047e561f77a07754ae923ade597d4e btrfs: Avoid trucating page or punching hole in a already existed hole. This patch will use the right length, which is the (exsisting->start - em->start) to insert, making the above patch works in 'no-holes' mode. Also, some small code style problems in above patch is fixed too. Reported-by: Filipe David Manana Signed-off-by: Qu Wenruo Reviewed-by: Filipe David Manana Tested-by: Filipe David Manana Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77e33534c7d6..f15c13f97018 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2215,7 +2215,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); lockend = round_down(offset + len, BTRFS_I(inode)->root->sectorsize) - 1; same_page = ((offset >> PAGE_CACHE_SHIFT) == @@ -2276,7 +2276,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) tail_start + tail_len, 0, 1); if (ret) goto out_only_mutex; - } + } } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ac260d41ccd..ae98df67950f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6146,14 +6146,14 @@ out_fail: static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, - u64 map_start, u64 map_len) + u64 map_start) { u64 start_diff; BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); start_diff = map_start - em->start; em->start = map_start; - em->len = map_len; + em->len = existing->start - em->start; if (em->block_start < EXTENT_MAP_LAST_BYTE && !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; @@ -6441,8 +6441,7 @@ insert: em->len); if (existing) { err = merge_extent_mapping(em_tree, existing, - em, start, - root->sectorsize); + em, start); free_extent_map(existing); if (err) { free_extent_map(em); -- cgit v1.2.1 From a3c108950d8e0bfcf48856cc159956022a7ad925 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 17 Aug 2014 15:09:21 -0500 Subject: btrfs: fix leak in qgroup_subtree_accounting() error path Coverity pointed this out; in the newly added qgroup_subtree_accounting(), if btrfs_find_all_roots() returns an error, we leak at least the parents pointer, and possibly the roots pointer, depending on what failure occurs. If btrfs_find_all_roots() returns an error, we need to free up all allocations before we return. "roots" is initialized to NULL, so it should be safe to free it unconditionally (ulist_free() handles that case). Cc: Mark Fasheh Signed-off-by: Eric Sandeen Reviewed-by: Mark Fasheh Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b497498484be..8abe45524de9 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1973,7 +1973,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, elem.seq, &roots); btrfs_put_tree_mod_seq(fs_info, &elem); if (ret < 0) - return ret; + goto out; if (roots->nnodes != 1) goto out; -- cgit v1.2.1 From 38c1c2e44bacb37efd68b90b3f70386a8ee370ee Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 19 Aug 2014 23:33:13 +0800 Subject: Btrfs: fix crash on endio of reading corrupted block The crash is ------------[ cut here ]------------ kernel BUG at fs/btrfs/extent_io.c:2124! [...] Workqueue: btrfs-endio normal_work_helper [btrfs] RIP: 0010:[] [] end_bio_extent_readpage+0xb45/0xcd0 [btrfs] This is in fact a regression. It is because we forgot to increase @offset properly in reading corrupted block, so that the @offset remains, and this leads to checksum errors while reading left blocks queued up in the same bio, and then ends up with hiting the above BUG_ON. Reported-by: Chris Murphy Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1c70cff4a9e1..f34b1e262d4e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2532,6 +2532,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; + offset += len; continue; } } -- cgit v1.2.1 From f6dc45c7a93a011dff6eb9b2ffda59c390c7705a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 20 Aug 2014 07:15:33 -0700 Subject: Btrfs: fix filemap_flush call in btrfs_file_release We should only be flushing on close if the file was flagged as needing it during truncate. I broke this with my ordered data vs transaction commit deadlock fix. Thanks to Miao Xie for catching this. Signed-off-by: Chris Mason Reported-by: Miao Xie Reported-by: Fengguang Wu --- fs/btrfs/file.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f15c13f97018..36861b7a6757 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1840,7 +1840,15 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { if (filp->private_data) btrfs_ioctl_trans_end(filp); - filemap_flush(inode->i_mapping); + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); return 0; } -- cgit v1.2.1 From e0b760ff71be168d4e623f7c3612e98902ab93e9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2014 09:58:22 -0400 Subject: locks: pass correct "before" pointer to locks_unlink_lock in generic_add_lease The argument to locks_unlink_lock can't be just any pointer to a pointer. It must be a pointer to the fl_next field in the previous lock in the list. Cc: # v3.15+ Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index cb66fb05ad4a..bb08857f90b5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1619,7 +1619,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp smp_mb(); error = check_conflicting_open(dentry, arg); if (error) - locks_unlink_lock(flp); + locks_unlink_lock(before); out: if (is_deleg) mutex_unlock(&inode->i_mutex); -- cgit v1.2.1 From 36de928641ee48b2078d3fe9514242aaa2f92013 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 23 Aug 2014 17:47:19 -0400 Subject: ext4: propagate errors up to ext4_find_entry()'s callers If we run into some kind of error, such as ENOMEM, while calling ext4_getblk() or ext4_dx_find_entry(), we need to make sure this error gets propagated up to ext4_find_entry() and then to its callers. This way, transient errors such as ENOMEM can get propagated to the VFS. This is important so that the system calls return the appropriate error, and also so that in the case of ext4_lookup(), we return an error instead of a NULL inode, since that will result in a negative dentry cache entry that will stick around long past the OOM condition which caused a transient ENOMEM error. Google-Bug-Id: #17142205 Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 2 +- fs/ext4/namei.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5b19760b1de5..4d95c3301775 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1825,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) /* * Special error return code only used by dx_probe() and its callers. */ -#define ERR_BAD_DX_DIR -75000 +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* * Timeout and state flag for lazy initialization inode thread. diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b147a67baa0d..ae7088b446d1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, err; + int i, err = 0; int namelen; *res_dir = NULL; @@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (bh || (err != ERR_BAD_DX_DIR)) + if (err == -ENOENT) + return NULL; + if (err && err != ERR_BAD_DX_DIR) + return ERR_PTR(err); + if (bh) return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1295,6 +1299,11 @@ restart: } num++; bh = ext4_getblk(NULL, dir, b++, 0, &err); + if (unlikely(err)) { + if (ra_max == 0) + return ERR_PTR(err); + break; + } bh_use[ra_max] = bh; if (bh) ll_rw_block(READ | REQ_META | REQ_PRIO, @@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-ENAMETOOLONG); bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child) struct buffer_head *bh; bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto end_rmdir; @@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto end_unlink; @@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de; bh = ext4_find_entry(dir, d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (bh) { retval = ext4_delete_entry(handle, dir, de, bh); brelse(bh); @@ -3202,6 +3221,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(new.inode); old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -3214,6 +3235,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + goto end_rename; + } if (new.bh) { if (!new.inode) { brelse(new.bh); @@ -3330,6 +3355,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -3342,6 +3369,10 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + goto end_rename; + } /* RENAME_EXCHANGE case: old *and* new must both exist */ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) -- cgit v1.2.1 From c99d1e6e83b06744c75d9f5e491ed495a7086b7b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 23 Aug 2014 17:47:28 -0400 Subject: ext4: fix BUG_ON in mb_free_blocks() If we suffer a block allocation failure (for example due to a memory allocation failure), it's possible that we will call ext4_discard_allocated_blocks() before we've actually allocated any blocks. In that case, fe_len and fe_start in ac->ac_f_ex will still be zero, and this will result in mb_free_blocks(inode, e4b, 0, 0) triggering the BUG_ON on mb_free_blocks(): BUG_ON(last >= (sb->s_blocksize << 3)); Fix this by bailing out of ext4_discard_allocated_blocks() if fs_len is zero. Also fix a missing ext4_mb_unload_buddy() call in ext4_discard_allocated_blocks(). Google-Bug-Id: 16844242 Fixes: 86f0afd463215fc3e58020493482faa4ac3a4d69 Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/mballoc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 956027711faf..8b0f9ef517d6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int last = first + count - 1; struct super_block *sb = e4b->bd_sb; + if (WARN_ON(count == 0)) + return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ @@ -3221,6 +3223,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) int err; if (pa == NULL) { + if (ac->ac_f_ex.fe_len == 0) + return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (err) { /* @@ -3235,6 +3239,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) -- cgit v1.2.1 From 4631dbf677ded0419fee35ca7408285dabfaef1a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 23 Aug 2014 17:48:28 -0400 Subject: ext4: move i_size,i_disksize update routines to helper function Cc: stable@vger.kernel.org # needed for bug fix patches Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 16 ++++++++++++++++ fs/ext4/extents.c | 17 ++++------------- fs/ext4/inode.c | 34 ++++++++-------------------------- 3 files changed, 28 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4d95c3301775..b0c225cdb52c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2454,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } +/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 76c2df382b7d..f0e6934291dd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4839,12 +4839,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - if (new_size) { - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); + ext4_update_inode_size(inode, new_size); } else { /* * Mark that we allocate beyond EOF so the subsequent truncate @@ -4886,7 +4882,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) int ret = 0; int flags; ext4_lblk_t lblk; - struct timespec tv; unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ @@ -4945,15 +4940,11 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_ERR(handle)) goto out; - tv = inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = ext4_current_time(inode); if (new_size) { - if (new_size > i_size_read(inode)) { - i_size_write(inode, new_size); - inode->i_mtime = tv; - } - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); + if (ext4_update_inode_size(inode, new_size) & 0x1) + inode->i_mtime = inode->i_ctime; } else { /* * Mark that we allocate beyond EOF so the subsequent truncate diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 367a60c07cf0..b1ddd9352644 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1055,27 +1055,11 @@ static int ext4_write_end(struct file *file, } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hole i_mutex. - * - * But it's important to update i_size while still holding page lock: + * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. */ - if (pos + copied > inode->i_size) { - i_size_write(inode, pos + copied); - i_size_changed = 1; - } - - if (pos + copied > EXT4_I(inode)->i_disksize) { - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * but greater than i_disksize. (hint delalloc) - */ - ext4_update_i_disksize(inode, (pos + copied)); - i_size_changed = 1; - } + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); page_cache_release(page); @@ -1123,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file, int ret = 0, ret2; int partial = 0; unsigned from, to; - loff_t new_i_size; + int size_changed = 0; trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1146,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - new_i_size = pos + copied; - if (new_i_size > inode->i_size) - i_size_write(inode, pos+copied); + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); + unlock_page(page); + page_cache_release(page); + + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } - unlock_page(page); - page_cache_release(page); if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside -- cgit v1.2.1 From 9e0af23764344f7f1b68e4eefbe7dc865018b63d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Aug 2014 23:36:53 +0800 Subject: Btrfs: fix task hang under heavy compressed write This has been reported and discussed for a long time, and this hang occurs in both 3.15 and 3.16. Btrfs now migrates to use kernel workqueue, but it introduces this hang problem. Btrfs has a kind of work queued as an ordered way, which means that its ordered_func() must be processed in the way of FIFO, so it usually looks like -- normal_work_helper(arg) work = container_of(arg, struct btrfs_work, normal_work); work->func() <---- (we name it work X) for ordered_work in wq->ordered_list ordered_work->ordered_func() ordered_work->ordered_free() The hang is a rare case, first when we find free space, we get an uncached block group, then we go to read its free space cache inode for free space information, so it will file a readahead request btrfs_readpages() for page that is not in page cache __do_readpage() submit_extent_page() btrfs_submit_bio_hook() btrfs_bio_wq_end_io() submit_bio() end_workqueue_bio() <--(ret by the 1st endio) queue a work(named work Y) for the 2nd also the real endio() So the hang occurs when work Y's work_struct and work X's work_struct happens to share the same address. A bit more explanation, A,B,C -- struct btrfs_work arg -- struct work_struct kthread: worker_thread() pick up a work_struct from @worklist process_one_work(arg) worker->current_work = arg; <-- arg is A->normal_work worker->current_func(arg) normal_work_helper(arg) A = container_of(arg, struct btrfs_work, normal_work); A->func() A->ordered_func() A->ordered_free() <-- A gets freed B->ordered_func() submit_compressed_extents() find_free_extent() load_free_space_inode() ... <-- (the above readhead stack) end_workqueue_bio() btrfs_queue_work(work C) B->ordered_free() As if work A has a high priority in wq->ordered_list and there are more ordered works queued after it, such as B->ordered_func(), its memory could have been freed before normal_work_helper() returns, which means that kernel workqueue code worker_thread() still has worker->current_work pointer to be work A->normal_work's, ie. arg's address. Meanwhile, work C is allocated after work A is freed, work C->normal_work and work A->normal_work are likely to share the same address(I confirmed this with ftrace output, so I'm not just guessing, it's rare though). When another kthread picks up work C->normal_work to process, and finds our kthread is processing it(see find_worker_executing_work()), it'll think work C as a collision and skip then, which ends up nobody processing work C. So the situation is that our kthread is waiting forever on work C. Besides, there're other cases that can lead to deadlock, but the real problem is that all btrfs workqueue shares one work->func, -- normal_work_helper, so this makes each workqueue to have its own helper function, but only a wraper pf normal_work_helper. With this patch, I no long hit the above hang. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 44 ++++++++++++++++++++++++++++++++-------- fs/btrfs/async-thread.h | 28 ++++++++++++++++++++++++- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/disk-io.c | 53 ++++++++++++++++++++++++++---------------------- fs/btrfs/extent-tree.c | 7 ++++--- fs/btrfs/inode.c | 35 +++++++++++++++++++++----------- fs/btrfs/ordered-data.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/raid56.c | 9 +++++--- fs/btrfs/reada.c | 3 ++- fs/btrfs/scrub.c | 14 +++++++------ fs/btrfs/volumes.c | 3 ++- 12 files changed, 141 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5a201d81049c..fbd76ded9a34 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "async-thread.h" #include "ctree.h" @@ -55,8 +54,39 @@ struct btrfs_workqueue { struct __btrfs_workqueue *high; }; -static inline struct __btrfs_workqueue -*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +static void normal_work_helper(struct btrfs_work *work); + +#define BTRFS_WORK_HELPER(name) \ +void btrfs_##name(struct work_struct *arg) \ +{ \ + struct btrfs_work *work = container_of(arg, struct btrfs_work, \ + normal_work); \ + normal_work_helper(work); \ +} + +BTRFS_WORK_HELPER(worker_helper); +BTRFS_WORK_HELPER(delalloc_helper); +BTRFS_WORK_HELPER(flush_delalloc_helper); +BTRFS_WORK_HELPER(cache_helper); +BTRFS_WORK_HELPER(submit_helper); +BTRFS_WORK_HELPER(fixup_helper); +BTRFS_WORK_HELPER(endio_helper); +BTRFS_WORK_HELPER(endio_meta_helper); +BTRFS_WORK_HELPER(endio_meta_write_helper); +BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(rmw_helper); +BTRFS_WORK_HELPER(endio_write_helper); +BTRFS_WORK_HELPER(freespace_write_helper); +BTRFS_WORK_HELPER(delayed_meta_helper); +BTRFS_WORK_HELPER(readahead_helper); +BTRFS_WORK_HELPER(qgroup_rescan_helper); +BTRFS_WORK_HELPER(extent_refs_helper); +BTRFS_WORK_HELPER(scrub_helper); +BTRFS_WORK_HELPER(scrubwrc_helper); +BTRFS_WORK_HELPER(scrubnc_helper); + +static struct __btrfs_workqueue * +__btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -232,13 +262,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) spin_unlock_irqrestore(lock, flags); } -static void normal_work_helper(struct work_struct *arg) +static void normal_work_helper(struct btrfs_work *work) { - struct btrfs_work *work; struct __btrfs_workqueue *wq; int need_order = 0; - work = container_of(arg, struct btrfs_work, normal_work); /* * We should not touch things inside work in the following cases: * 1) after work->func() if it has no ordered_free @@ -262,7 +290,7 @@ static void normal_work_helper(struct work_struct *arg) trace_btrfs_all_work_done(work); } -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free) @@ -270,7 +298,7 @@ void btrfs_init_work(struct btrfs_work *work, work->func = func; work->ordered_func = ordered_func; work->ordered_free = ordered_free; - INIT_WORK(&work->normal_work, normal_work_helper); + INIT_WORK(&work->normal_work, uniq_func); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 9c6b66d15fb0..e9e31c94758f 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -19,12 +19,14 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ +#include struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; @@ -38,11 +40,35 @@ struct btrfs_work { unsigned long flags; }; +#define BTRFS_WORK_HELPER_PROTO(name) \ +void btrfs_##name(struct work_struct *arg) + +BTRFS_WORK_HELPER_PROTO(worker_helper); +BTRFS_WORK_HELPER_PROTO(delalloc_helper); +BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); +BTRFS_WORK_HELPER_PROTO(cache_helper); +BTRFS_WORK_HELPER_PROTO(submit_helper); +BTRFS_WORK_HELPER_PROTO(fixup_helper); +BTRFS_WORK_HELPER_PROTO(endio_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); +BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(rmw_helper); +BTRFS_WORK_HELPER_PROTO(endio_write_helper); +BTRFS_WORK_HELPER_PROTO(freespace_write_helper); +BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); +BTRFS_WORK_HELPER_PROTO(readahead_helper); +BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); +BTRFS_WORK_HELPER_PROTO(extent_refs_helper); +BTRFS_WORK_HELPER_PROTO(scrub_helper); +BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); +BTRFS_WORK_HELPER_PROTO(scrubnc_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh); -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index da775bfdebc9..a2e90f855d7d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, - NULL, NULL); + btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, + btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c99a414813c1..a1d36e62179c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -39,7 +39,6 @@ #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" -#include "async-thread.h" #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" @@ -693,35 +692,41 @@ static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; fs_info = end_io_wq->info; end_io_wq->error = err; - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_work(fs_info->endio_meta_write_workers, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_work(fs_info->endio_freespace_worker, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_write_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { + wq = fs_info->endio_meta_write_workers; + func = btrfs_endio_meta_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else if (end_io_wq->metadata) - btrfs_queue_work(fs_info->endio_meta_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else if (end_io_wq->metadata) { + wq = fs_info->endio_meta_workers; + func = btrfs_endio_meta_helper; + } else { + wq = fs_info->endio_workers; + func = btrfs_endio_helper; + } } + + btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); + btrfs_queue_work(wq, &end_io_wq->work); } /* @@ -828,7 +833,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - btrfs_init_work(&async->work, run_one_async_start, + btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, run_one_async_done, run_one_async_free); async->bio_flags = bio_flags; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5524434da059..3efe1c3877bf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, + caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->sync = 0; init_completion(&async->wait); - btrfs_init_work(&async->work, delayed_ref_async_start, - NULL, NULL); + btrfs_init_work(&async->work, btrfs_extent_refs_helper, + delayed_ref_async_start, NULL, NULL); btrfs_queue_work(root->fs_info->extent_workers, &async->work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ae98df67950f..3d020d6d9ace 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1096,8 +1096,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - btrfs_init_work(&async_cow->work, async_cow_start, - async_cow_submit, async_cow_free); + btrfs_init_work(&async_cow->work, + btrfs_delalloc_helper, + async_cow_start, async_cow_submit, + async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; @@ -1881,7 +1883,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_fixup_helper, + btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; @@ -2822,7 +2825,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *workers; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2831,13 +2835,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); + if (btrfs_is_free_space_inode(inode)) { + wq = root->fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = root->fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } - if (btrfs_is_free_space_inode(inode)) - workers = root->fs_info->endio_freespace_worker; - else - workers = root->fs_info->endio_write_workers; - btrfs_queue_work(workers, &ordered_extent->work); + btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, + NULL); + btrfs_queue_work(wq, &ordered_extent->work); return 0; } @@ -7208,7 +7216,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, btrfs_endio_write_helper, + finish_ordered_fn, NULL, NULL); btrfs_queue_work(root->fs_info->endio_write_workers, &ordered->work); out_test: @@ -8535,7 +8544,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + WARN_ON_ONCE(!inode); + btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, + btrfs_run_delalloc_work, NULL, NULL); return work; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 963895c1f801..ac734ec4cc20 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, + btrfs_flush_delalloc_helper, btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(root->fs_info->flush_workers, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8abe45524de9..ded5c601d916 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2720,6 +2720,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4a88f073fdd7..0a6b6e4bcbb9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1416,7 +1416,8 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + rmw_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio) static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + read_rebuild_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_init_work(&plug->work, btrfs_rmw_helper, + unplug_work, NULL, NULL); btrfs_queue_work(plug->info->rmw_workers, &plug->work); return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 09230cf3a244..20408c6b665a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); + btrfs_init_work(&rmw->work, btrfs_readahead_helper, + reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 23d3f6e6a482..f4a41f37be22 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -428,8 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, - NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrub_helper, + scrub_bio_end_io_worker, NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -999,8 +999,8 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, - NULL, NULL); + btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper, + scrub_fixup_nodatasum, NULL, NULL); btrfs_queue_work(fs_info->scrub_workers, &fixup_nodatasum->work); goto out; @@ -1616,7 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, + scrub_wr_bio_end_io_worker, NULL, NULL); btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } @@ -3214,7 +3215,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); + btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, + copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); btrfs_queue_work(fs_info->scrub_nocow_workers, &nocow_ctx->work); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d4ce53d7569..340a92d08e84 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5854,7 +5854,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); + btrfs_init_work(&dev->work, btrfs_submit_helper, + pending_bios_fn, NULL, NULL); return dev; } -- cgit v1.2.1 From aee7af356e151494d5014f57b33460b162f181b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Aug 2014 22:33:12 -0400 Subject: NFSv4: Fix problems with close in the presence of a delegation In the presence of delegations, we can no longer assume that the state->n_rdwr, state->n_rdonly, state->n_wronly reflect the open stateid share mode, and so we need to calculate the initial value for calldata->arg.fmode using the state->flags. Reported-by: James Drews Fixes: 88069f77e1ac5 (NFSv41: Fix a potential state leakage when...) Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 75ae8d22f067..ff94a2f36051 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2601,6 +2601,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; dprintk("%s: begin!\n", __func__); @@ -2608,18 +2609,24 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.fmode = FMODE_READ|FMODE_WRITE; spin_lock(&state->owner->so_lock); + is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); + is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); + is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); + /* Calculate the current open share mode */ + calldata->arg.fmode = 0; + if (is_rdonly || is_rdwr) + calldata->arg.fmode |= FMODE_READ; + if (is_wronly || is_rdwr) + calldata->arg.fmode |= FMODE_WRITE; /* Calculate the change in open mode */ if (state->n_rdwr == 0) { if (state->n_rdonly == 0) { - call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); - call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= is_rdonly || is_rdwr; calldata->arg.fmode &= ~FMODE_READ; } if (state->n_wronly == 0) { - call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); - call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= is_wronly || is_rdwr; calldata->arg.fmode &= ~FMODE_WRITE; } } -- cgit v1.2.1 From 412f6c4c26fb1eba8844290663837561ac53fa6e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Aug 2014 22:09:08 -0400 Subject: NFSv4: Don't clear the open state when we just did an OPEN_DOWNGRADE If we did an OPEN_DOWNGRADE, then the right thing to do on success, is to apply the new open mode to the struct nfs4_state. Instead, we were unconditionally clearing the state, making it appear to our state machinery as if we had just performed a CLOSE. Fixes: 226056c5c312b (NFSv4: Use correct locking when updating nfs4_state...) Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff94a2f36051..7dd8aca31c29 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2560,6 +2560,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); + nfs4_stateid *res_stateid = NULL; dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -2570,12 +2571,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) */ switch (task->tk_status) { case 0: - if (calldata->roc) + res_stateid = &calldata->res.stateid; + if (calldata->arg.fmode == 0 && calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); - nfs_clear_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - goto out_release; + break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -2589,7 +2590,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) goto out_release; } } - nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); + nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); -- cgit v1.2.1 From f87d928f6d98644d39809a013a22f981d39017cf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Aug 2014 14:46:48 -0400 Subject: NFSv3: Fix another acl regression When creating a new object on the NFS server, we should not be sending posix setacl requests unless the preceding posix_acl_create returned a non-trivial acl. Doing so, causes Solaris servers in particular to return an EINVAL. Fixes: 013cdf1088d72 (nfs: use generic posix ACL infrastructure,,,) Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1132786 Cc: stable@vger.kernel.org # 3.14+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index d0fec260132a..24c6898159cc 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -129,7 +129,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status; + int status = 0; + + if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL)) + goto out; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) -- cgit v1.2.1 From 69dc9536405213c1d545fcace1fc15c481d00aae Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Aug 2014 18:33:49 -0400 Subject: ext4: fix incorect journal credits reservation in ext4_zero_range Currently we reserve only 4 blocks but in worst case scenario ext4_zero_partial_blocks() may want to zeroout and convert two non adjacent blocks. Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f0e6934291dd..0e9de2328bd2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4731,6 +4731,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t new_size = 0; int ret = 0; int flags; + int credits; int partial; loff_t start, end; ext4_lblk_t lblk; @@ -4830,8 +4831,14 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (ret) goto out_dio; } - - handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + /* + * In worst case we have to writeout two nonadjacent unwritten + * blocks and update the inode + */ + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; + if (ext4_should_journal_data(inode)) + credits += 2; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); -- cgit v1.2.1 From c174e6d6979a04b7b77b93f244396be4b81f8bfb Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Aug 2014 18:40:00 -0400 Subject: ext4: fix transaction issues for ext4_fallocate and ext_zero_range After commit f282ac19d86f we use different transactions for preallocation and i_disksize update which result in complain from fsck after power-failure. spotted by generic/019. IMHO this is regression because fs becomes inconsistent, even more 'e2fsck -p' will no longer works (which drives admins go crazy) Same transaction requirement applies ctime,mtime updates testcase: xfstest generic/019 Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 68 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0e9de2328bd2..74292a71b384 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4665,7 +4665,8 @@ retry: } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, - ext4_lblk_t len, int flags, int mode) + ext4_lblk_t len, loff_t new_size, + int flags, int mode) { struct inode *inode = file_inode(file); handle_t *handle; @@ -4674,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int retries = 0; struct ext4_map_blocks map; unsigned int credits; + loff_t epos; map.m_lblk = offset; + map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple @@ -4690,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, credits = ext4_chunk_trans_blocks(inode, len); retry: - while (ret >= 0 && ret < len) { - map.m_lblk = map.m_lblk + ret; - map.m_len = len = len - ret; + while (ret >= 0 && len) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { @@ -4709,6 +4710,21 @@ retry: ret2 = ext4_journal_stop(handle); break; } + map.m_lblk += ret; + map.m_len = len = len - ret; + epos = (loff_t)map.m_lblk << inode->i_blkbits; + inode->i_ctime = ext4_current_time(inode); + if (new_size) { + if (epos > new_size) + epos = new_size; + if (ext4_update_inode_size(inode, epos) & 0x1) + inode->i_mtime = inode->i_ctime; + } else { + if (epos > inode->i_size) + ext4_set_inode_flag(inode, + EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret2) break; @@ -4732,7 +4748,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, int ret = 0; int flags; int credits; - int partial; + int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; struct address_space *mapping = inode->i_mapping; @@ -4772,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (start < offset || end > offset + len) return -EINVAL; - partial = (offset + len) & ((1 << blkbits) - 1); + partial_begin = offset & ((1 << blkbits) - 1); + partial_end = (offset + len) & ((1 << blkbits) - 1); lblk = start >> blkbits; max_blocks = (end >> blkbits); @@ -4806,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, * If we have a partial block after EOF we have to allocate * the entire block. */ - if (partial) + if (partial_end) max_blocks += 1; } @@ -4814,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages*/ truncate_pagecache_range(inode, start, end - 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); @@ -4826,11 +4844,14 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (ret) goto out_dio; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, - mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); if (ret) goto out_dio; } + if (!partial_begin && !partial_end) + goto out_dio; + /* * In worst case we have to writeout two nonadjacent unwritten * blocks and update the inode @@ -4856,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } - ext4_mark_inode_dirty(handle, inode); /* Zero out partial block at the edges of the range */ @@ -4883,7 +4903,6 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - handle_t *handle; loff_t new_size = 0; unsigned int max_blocks; int ret = 0; @@ -4939,32 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); if (ret) goto out; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - goto out; - - inode->i_ctime = ext4_current_time(inode); - - if (new_size) { - if (ext4_update_inode_size(inode, new_size) & 0x1) - inode->i_mtime = inode->i_ctime; - } else { - /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if ((offset + len) > i_size_read(inode)) - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } - ext4_mark_inode_dirty(handle, inode); - if (file->f_flags & O_SYNC) - ext4_handle_sync(handle); - - ext4_journal_stop(handle); out: mutex_unlock(&inode->i_mutex); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -- cgit v1.2.1 From 6603120e96eae9a5d6228681ae55c7fdc998d1bb Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Aug 2014 18:40:03 -0400 Subject: ext4: update i_disksize coherently with block allocation on error path In case of delalloc block i_disksize may be less than i_size. So we have to update i_disksize each time we allocated and submitted some blocks beyond i_disksize. We weren't doing this on the error paths, so fix this. testcase: xfstest generic/019 Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b1ddd9352644..3aa26e9117c4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2077,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; + int progress = 0; mpd->io_submit.io_end->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; @@ -2093,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle, * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || - (err == -ENOSPC && ext4_count_free_clusters(sb))) + (err == -ENOSPC && ext4_count_free_clusters(sb))) { + if (progress) + goto update_disksize; return err; + } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" @@ -2111,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle, *give_up_on_write = true; return err; } + progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) - return err; + goto update_disksize; } while (map->m_len); +update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. -- cgit v1.2.1 From 022eaa7517017efe4f6538750c2b59a804dc7df7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2014 18:40:05 -0400 Subject: jbd2: fix infinite loop when recovering corrupt journal blocks When recovering the journal, don't fall into an infinite loop if we encounter a corrupt journal block. Instead, just skip the block and return an error, which fails the mount and thus forces the user to run a full filesystem fsck. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/jbd2/recovery.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3b6bb19d60b1..00e9703d7dc6 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -426,6 +426,7 @@ static int do_one_pass(journal_t *journal, int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ int descr_csum_size = 0; + int block_error = 0; /* * First thing is to establish what we expect to find in the log @@ -598,7 +599,8 @@ static int do_one_pass(journal_t *journal, "checksum recovering " "block %llu in log\n", blocknr); - continue; + block_error = 1; + goto skip_write; } /* Find a buffer for the new @@ -797,7 +799,8 @@ static int do_one_pass(journal_t *journal, success = -EIO; } } - + if (block_error && success == 0) + success = -EIO; return success; failed: -- cgit v1.2.1 From db9ee220361de03ee86388f9ea5e529eaad5323c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2014 18:40:07 -0400 Subject: jbd2: fix descriptor block size handling errors with journal_csum It turns out that there are some serious problems with the on-disk format of journal checksum v2. The foremost is that the function to calculate descriptor tag size returns sizes that are too big. This causes alignment issues on some architectures and is compounded by the fact that some parts of jbd2 use the structure size (incorrectly) to determine the presence of a 64bit journal instead of checking the feature flags. Therefore, introduce journal checksum v3, which enlarges the descriptor block tag format to allow for full 32-bit checksums of journal blocks, fix the journal tag function to return the correct sizes, and fix the jbd2 recovery code to use feature flags to determine 64bitness. Add a few function helpers so we don't have to open-code quite so many pieces. Switching to a 16-byte block size was found to increase journal size overhead by a maximum of 0.1%, to convert a 32-bit journal with no checksumming to a 32-bit journal with checksum v3 enabled. Signed-off-by: Darrick J. Wong Reported-by: TR Reardon Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 5 +++-- fs/jbd2/commit.c | 21 +++++++++++--------- fs/jbd2/journal.c | 56 ++++++++++++++++++++++++++++++++++++------------------ fs/jbd2/recovery.c | 26 ++++++++++++++----------- fs/jbd2/revoke.c | 6 +++--- 5 files changed, 70 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 32b43ad154b9..0b28b36e7915 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3181,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb) if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { - /* journal checksum v2 */ + /* journal checksum v3 */ compat = 0; - incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; @@ -3205,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6fac74349856..b73e0215baa7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) struct commit_header *h; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; h = (struct commit_header *)(bh->b_data); @@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) return checksum; } -static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, +static void write_tag_block(journal_t *j, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); - if (tag_bytes > JBD2_TAG_SIZE32) + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } @@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j, struct jbd2_journal_block_tail *tail; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - @@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j, static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; struct page *page = bh->b_page; __u8 *addr; __u32 csum32; __be32 seq; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; seq = cpu_to_be32(sequence); @@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, bh->b_size); kunmap_atomic(addr); - /* We only have space to store the lower 16 bits of the crc32c. */ - tag->t_checksum = cpu_to_be16(csum32); + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + tag3->t_checksum = cpu_to_be32(csum32); + else + tag->t_checksum = cpu_to_be16(csum32); } /* * jbd2_journal_commit_transaction @@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) LIST_HEAD(io_bufs); LIST_HEAD(log_bufs); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); /* @@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag_flag |= JBD2_FLAG_SAME_UUID; tag = (journal_block_tag_t *) tagp; - write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); + write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be16(tag_flag); jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], commit_transaction->t_tid); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 67b8e303946c..19d74d86d99c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug); /* Checksumming functions */ static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; @@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) { - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; return sb->s_checksum == jbd2_superblock_csum(j, sb); @@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) { - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; sb->s_checksum = jbd2_superblock_csum(j, sb); @@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && - JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + if (jbd2_journal_has_csum_v2or3(journal) && + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " "at the same time!\n"); goto out; } + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + /* Can't have checksum v2 and v3 at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + goto out; + } + if (!jbd2_verify_csum_type(journal, sb)) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; } /* Load the checksum driver */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + if (jbd2_journal_has_csum_v2or3(journal)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); @@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal) } /* Precompute checksum seed for all metadata */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); @@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) return 0; - /* Asking for checksumming v2 and v1? Only give them v2. */ - if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && + /* If enabling v2 checksums, turn on v3 instead */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { + incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; + incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; + } + + /* Asking for checksumming v3 and v1? Only give them v3. */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && compat & JBD2_FEATURE_COMPAT_CHECKSUM) compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; @@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; - /* If enabling v2 checksums, update superblock */ - if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + /* If enabling v3 checksums, update superblock */ + if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); @@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, } /* Precompute checksum seed for all metadata */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); @@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, /* If enabling v1 checksums, downgrade superblock */ if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) sb->s_feature_incompat &= - ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | + JBD2_FEATURE_INCOMPAT_CSUM_V3); sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); @@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode) */ size_t journal_tag_bytes(journal_t *journal) { - journal_block_tag_t tag; - size_t x = 0; + size_t sz; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + return sizeof(journal_block_tag3_t); + + sz = sizeof(journal_block_tag_t); if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) - x += sizeof(tag.t_checksum); + sz += sizeof(__u16); if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) - return x + JBD2_TAG_SIZE64; + return sz; else - return x + JBD2_TAG_SIZE32; + return sz - sizeof(__u32); } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 00e9703d7dc6..9b329b55ffe3 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j, __be32 provided; __u32 calculated; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - @@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) int nr = 0, size = journal->j_blocksize; int tag_bytes = journal_tag_bytes(journal); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) size -= sizeof(struct jbd2_journal_block_tail); tagp = &bh->b_data[sizeof(journal_header_t)]; @@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal) return err; } -static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) +static inline unsigned long long read_tag_block(journal_t *journal, + journal_block_tag_t *tag) { unsigned long long block = be32_to_cpu(tag->t_blocknr); - if (tag_bytes > JBD2_TAG_SIZE32) + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; return block; } @@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) __be32 provided; __u32 calculated; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; h = buf; @@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, void *buf, __u32 sequence) { + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; __u32 csum32; __be32 seq; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; seq = cpu_to_be32(sequence); csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); - return tag->t_checksum == cpu_to_be16(csum32); + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + return tag3->t_checksum == cpu_to_be32(csum32); + else + return tag->t_checksum == cpu_to_be16(csum32); } static int do_one_pass(journal_t *journal, @@ -513,8 +518,7 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* Verify checksum first */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) descr_csum_size = sizeof(struct jbd2_journal_block_tail); if (descr_csum_size > 0 && @@ -575,7 +579,7 @@ static int do_one_pass(journal_t *journal, unsigned long long blocknr; J_ASSERT(obh != NULL); - blocknr = read_tag_block(tag_bytes, + blocknr = read_tag_block(journal, tag); /* If the block has been @@ -814,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j, __be32 provided; __u32 calculated; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 198c9c10276d..d5e95a175c92 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -91,8 +91,8 @@ #include #include #include -#endif #include +#endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; @@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal, offset = *offsetp; /* Do we need to leave space at the end for a checksum? */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); /* Make sure we have a descriptor with space left for the record */ @@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) struct jbd2_journal_revoke_tail *tail; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - -- cgit v1.2.1 From d80d448c6c5bdd32605b78a60fe8081d82d4da0f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2014 18:40:09 -0400 Subject: ext4: fix same-dir rename when inline data directory overflows When performing a same-directory rename, it's possible that adding or setting the new directory entry will cause the directory to overflow the inline data area, which causes the directory to be converted to an extent-based directory. Under this circumstance it is necessary to re-read the directory when deleting the old dirent because the "old directory" context still points to i_block in the inode table, which is now an extent tree root! The delete fails with an FS error, and the subsequent fsck complains about incorrect link counts and hardlinked directories. Test case (originally found with flat_dir_test in the metadata_csum test program): # mkfs.ext4 -O inline_data /dev/sda # mount /dev/sda /mnt # mkdir /mnt/x # touch /mnt/x/changelog.gz /mnt/x/copyright /mnt/x/README.Debian # sync # for i in /mnt/x/*; do mv $i $i.longer; done # ls -la /mnt/x/ total 0 -rw-r--r-- 1 root root 0 Aug 25 12:03 changelog.gz.longer -rw-r--r-- 1 root root 0 Aug 25 12:03 copyright -rw-r--r-- 1 root root 0 Aug 25 12:03 copyright.longer -rw-r--r-- 1 root root 0 Aug 25 12:03 README.Debian.longer (Hey! Why are there four files now??) Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ae7088b446d1..90a3cdca3f88 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3147,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, return retval; } -static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, + int force_reread) { int retval; /* @@ -3159,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || ent->de->name_len != ent->dentry->d_name.len || strncmp(ent->de->name, ent->dentry->d_name.name, - ent->de->name_len)) { + ent->de->name_len) || + force_reread) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } else { @@ -3210,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, .dentry = new_dentry, .inode = new_dentry->d_inode, }; + int force_reread; int retval; dquot_initialize(old.dir); @@ -3271,6 +3274,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (retval) goto end_rename; } + /* + * If we're renaming a file within an inline_data dir and adding or + * setting the new dirent causes a conversion from inline_data to + * extents/blockmap, we need to force the dirent delete code to + * re-read the directory, or else we end up trying to delete a dirent + * from what is now the extent tree root (or a block map). + */ + force_reread = (new.dir->i_ino == old.dir->i_ino && + ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) @@ -3281,6 +3293,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (retval) goto end_rename; } + if (force_reread) + force_reread = !ext4_test_inode_flag(new.dir, + EXT4_INODE_INLINE_DATA); /* * Like most other Unix systems, set the ctime for inodes on a @@ -3292,7 +3307,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, /* * ok, that's it */ - ext4_rename_delete(handle, &old); + ext4_rename_delete(handle, &old, force_reread); if (new.inode) { ext4_dec_count(handle, new.inode); -- cgit v1.2.1 From 2b462638e41ea62230297c21c4da9955937b7a3c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 29 Aug 2014 15:18:58 -0700 Subject: ocfs2: do not write error flag to user structure we cannot copy from/to If we failed to copy from the structure, writing back the flags leaks 31 bits of kernel memory (the rest of the ir_flags field). In any case, if we cannot copy from/to the structure, why should we expect putting just the flags to work? Also make sure ocfs2_info_handle_freeinode() returns the right error code if the copy_to_user() fails. Fixes: ddee5cdb70e6 ('Ocfs2: Add new OCFS2_IOC_INFO ioctl for ocfs2 v8.') Signed-off-by: Ben Hutchings Cc: Joel Becker Acked-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 129 +++++++++++++++++++------------------------------------ 1 file changed, 43 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 6f66b3751ace..53e6c40ed4c6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -35,9 +35,8 @@ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* - * This call is void because we are already reporting an error that may - * be -EFAULT. The error will be returned from the ioctl(2) call. It's - * just a best-effort to tell userspace that this request caused the error. + * This is just a best-effort to tell userspace that this request + * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) @@ -146,136 +145,105 @@ bail: static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) - goto bail; + return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oib.ib_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) - goto bail; + return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oic.ic_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) - goto bail; + return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oim.im_req, req); - - return status; + return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) - goto bail; + return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oil.il_req, req); - - return status; + return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) - goto bail; + return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oiu.iu_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) - goto bail; + return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; @@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode, o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oif.if_req, req); - - return status; + return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) - goto bail; + return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; + return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, @@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, u32 i; u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; @@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto out_err; } - if (o2info_from_user(*oifi, req)) - goto bail; + if (o2info_from_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } oifi->ifi_slotnum = osb->max_slots; @@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, o2info_set_request_filled(&oifi->ifi_req); - if (o2info_to_user(*oifi, req)) - goto bail; + if (o2info_to_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); - +out_free: kfree(oifi); out_err: return status; @@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, { u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto out_err; } - if (o2info_from_user(*oiff, req)) - goto bail; + if (o2info_from_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } /* * chunksize from userspace should be power of 2. */ @@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, if (o2info_to_user(*oiff, req)) { status = -EFAULT; - goto bail; + goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); - +out_free: kfree(oiff); out_err: return status; @@ -727,23 +690,17 @@ out_err: static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) - goto bail; + return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oir, req); - - return status; + return 0; } /* -- cgit v1.2.1 From c43c363def04cdaed0d9e26dae846081f55714e7 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:00 -0700 Subject: ocfs2: o2net: don't shutdown connection when idle timeout This patch series is to fix a possible message lost bug in ocfs2 when network go bad. This bug will cause ocfs2 hung forever even network become good again. The messages may lost in this case. After the tcp connection is established between two nodes, an idle timer will be set to check its state periodically, if no messages are received during this time, idle timer will timeout, it will shutdown the connection and try to reconnect, so pending messages in tcp queues will be lost. This messages may be from dlm. Dlm may get hung in this case. This may cause the whole ocfs2 cluster hung. This is very possible to happen when network state goes bad. Do the reconnect is useless, it will fail if network state is still bad. Just waiting there for network recovering may be a good idea, it will not lost messages and some node will be fenced until cluster goes into split-brain state, for this case, Tcp user timeout is used to override the tcp retransmit timeout. It will timeout after 25 days, user should have notice this through the provided log and fix the network, if they don't, ocfs2 will fall back to original reconnect way. This patch (of 3): Some messages in the tcp queue maybe lost if we shutdown the connection and reconnect when idle timeout. If packets lost and reconnect success, then the ocfs2 cluster maybe hung. To fix this, we can leave the connection there and do the fence decision when idle timeout, if network recover before fence dicision is made, the connection survive without lost any messages. This bug can be saw when network state go bad. It may cause ocfs2 hung forever if some packets lost. With this fix, ocfs2 will recover from hung if network becomes good again. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 681691bc233a..2334bfc966c1 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1536,16 +1536,20 @@ static void o2net_idle_timer(unsigned long data) #endif printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), - msecs / 1000, msecs % 1000); + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in o2net_start_connect. + /* idle timerout happen, don't shutdown the connection, but + * make fence decision. Maybe the connection can recover before + * the decision is made. */ atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) @@ -1560,6 +1564,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear fence decision since the connection recover from timeout*/ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + /* Only push out an existing timer */ if (timer_pending(&sc->sc_idle_timeout)) o2net_sc_reset_idle_timer(sc); -- cgit v1.2.1 From 8e9801dfe37c9e68cdbfcd15988df2187191864e Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:02 -0700 Subject: ocfs2: o2net: set tcp user timeout to max value When tcp retransmit timeout(15mins), the connection will be closed. Pending messages may be lost during this time. So we set tcp user timeout to override the retransmit timeout to the max value. This is OK for ocfs2 since we have disk heartbeat, if peer crash, the disk heartbeat will timeout and it will be evicted, if disk heartbeat not timeout and connection idle for a long time, then this means the cluster enters split-brain state, since fence can't happen, we'd better keep the connection and wait network recover. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 20 ++++++++++++++++++++ fs/ocfs2/cluster/tcp.h | 1 + 2 files changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2334bfc966c1..ea34952f9496 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock) return ret; } +static int o2net_set_usertimeout(struct socket *sock) +{ + int user_timeout = O2NET_TCP_USER_TIMEOUT; + + return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, sizeof(user_timeout)); +} + static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1663,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work) goto out; } + ret = o2net_set_usertimeout(sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + o2net_register_callbacks(sc->sc_sock->sk, sc); spin_lock(&nn->nn_lock); @@ -1844,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } + ret = o2net_set_usertimeout(new_sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + slen = sizeof(sin); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, &slen, 1); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..c571e849fda4 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) -- cgit v1.2.1 From 8c7b638cece146234b0c0d5f6ba84d1cf6f81e83 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:04 -0700 Subject: ocfs2: quorum: add a log for node not fenced For debug use, we can see from the log whether the fence decision is made and why it is not fenced. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/quorum.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..62e8ec619b4c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work) } out: - spin_unlock(&qs->qs_lock); - if (fence) + if (fence) { + spin_unlock(&qs->qs_lock); o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock(&qs->qs_lock); + + } + } static void o2quo_set_hold(struct o2quo_state *qs, u8 node) -- cgit v1.2.1