diff options
author | Qu Wenruo <quwenruo@cn.fujitsu.com> | 2017-02-27 15:10:39 +0800 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2017-06-29 20:17:02 +0200 |
commit | bc42bda22345efdb5d8b578d1b4df2c6eaa85c58 (patch) | |
tree | 825e007666671ac31bd140e61b48540178cbb49e /fs/btrfs/qgroup.c | |
parent | 364ecf3651e0862152c8b340d7cb3021dc0122c7 (diff) | |
download | talos-obmc-linux-bc42bda22345efdb5d8b578d1b4df2c6eaa85c58.tar.gz talos-obmc-linux-bc42bda22345efdb5d8b578d1b4df2c6eaa85c58.zip |
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/qgroup.c')
-rw-r--r-- | fs/btrfs/qgroup.c | 72 |
1 files changed, 67 insertions, 5 deletions
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 002088937021..fc9dffaa9524 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2892,13 +2892,72 @@ cleanup: return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + int freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -2924,14 +2983,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2951,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, |