diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 202 |
1 files changed, 136 insertions, 66 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 855935f6671a..611b66d73e80 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1845,8 +1845,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int ret; ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -1906,8 +1908,10 @@ mapit: ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); out: - if (ret < 0) - bio_endio(bio, ret); + if (ret < 0) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -3654,6 +3658,35 @@ cache_index: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We don't persist the id of the transaction where an unlink operation + * against the inode was last made. So here we assume the inode might + * have been evicted, and therefore the exact value of last_unlink_trans + * lost, and set it to last_trans to avoid metadata inconsistencies + * between the inode and its parent if the inode is fsync'ed and the log + * replayed. For example, in the scenario: + * + * touch mydir/foo + * ln mydir/foo mydir/bar + * sync + * unlink mydir/bar + * echo 2 > /proc/sys/vm/drop_caches # evicts inode + * xfs_io -c fsync mydir/foo + * <power failure> + * mount fs, triggers fsync log replay + * + * We must make sure that when we fsync our inode foo we also log its + * parent inode, otherwise after log replay the parent still has the + * dentry with the "bar" name but our inode foo has a link count of 1 + * and doesn't have an inode ref with the name "bar" anymore. + * + * Setting last_unlink_trans to last_trans is a pessimistic approach, + * but it guarantees correctness at the expense of ocassional full + * transaction commits on fsync if our inode is a directory, or if our + * inode is not a directory, logging its parent unnecessarily. + */ + BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -4209,7 +4242,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 last_size = (u64)-1; + u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4493,8 +4526,7 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); @@ -4989,8 +5021,9 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. * We can have ongoing bios started by readpages (called from readahead) - * that didn't get their end io callbacks called yet or they are still - * in progress ((extent_io.c:end_bio_extent_readpage()). This means some + * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before * submitting those bios, which are executed by a separate task (work * queue kthread), inode references (inode->i_count) were not taken @@ -5051,7 +5084,8 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (!special_file(inode->i_mode)) + btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(inode, 0, (u64)-1); @@ -6876,8 +6910,7 @@ out: trace_btrfs_get_extent(root, em); - if (path) - btrfs_free_path(path); + btrfs_free_path(path); if (trans) { ret = btrfs_end_transaction(trans, root); if (!err) @@ -7376,6 +7409,10 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; +}; static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -7383,10 +7420,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; + struct btrfs_dio_data *dio_data = NULL; u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7404,7 +7441,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * that anything that needs to check if there's a transction doesn't get * confused. */ - outstanding_extents = current->journal_info; + dio_data = current->journal_info; current->journal_info = NULL; } @@ -7536,16 +7573,18 @@ unlock: * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (*outstanding_extents) { - (*outstanding_extents)--; + if (dio_data->outstanding_extents) { + (dio_data->outstanding_extents)--; } else { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); + WARN_ON(dio_data->reserve < len); + dio_data->reserve -= len; + current->journal_info = dio_data; } /* @@ -7568,8 +7607,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); - if (outstanding_extents) - current->journal_info = outstanding_extents; + if (dio_data) + current->journal_info = dio_data; return ret; } @@ -7687,13 +7726,13 @@ struct btrfs_retry_complete { int uptodate; }; -static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct bio_vec *bvec; int i; - if (err) + if (bio->bi_error) goto end; done->uptodate = 1; @@ -7742,7 +7781,7 @@ try_again: return 0; } -static void btrfs_retry_endio(struct bio *bio, int err) +static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); @@ -7751,7 +7790,7 @@ static void btrfs_retry_endio(struct bio *bio, int err) int ret; int i; - if (err) + if (bio->bi_error) goto end; uptodate = 1; @@ -7834,12 +7873,13 @@ static int btrfs_subio_endio_read(struct inode *inode, } } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static void btrfs_endio_direct_read(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + int err = bio->bi_error; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); @@ -7850,17 +7890,14 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip); - /* If we had a csum failure make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) io_bio->end_io(io_bio, err); bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio, int err) +static void btrfs_endio_direct_write(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; @@ -7871,12 +7908,11 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct bio *dio_bio; int ret; - if (err) - goto out_done; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes, !err); + ordered_bytes, + !bio->bi_error); if (!ret) goto out_test; @@ -7895,15 +7931,11 @@ out_test: ordered = NULL; goto again; } -out_done: dio_bio = dip->dio_bio; kfree(dip); - /* If we had an error make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -7918,9 +7950,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } -static void btrfs_end_dio_bio(struct bio *bio, int err) +static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + int err = bio->bi_error; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -7949,8 +7982,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); - bio_endio(dip->orig_bio, 0); + dip->dio_bio->bi_error = 0; + bio_endio(dip->orig_bio); } out: bio_put(bio); @@ -7959,8 +7992,11 @@ out: static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, u64 first_sector, gfp_t gfp_flags) { - int nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); + struct bio *bio; + bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); + if (bio) + bio_associate_current(bio); + return bio; } static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, @@ -8163,9 +8199,8 @@ out_err: static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio *io_bio; + struct btrfs_dio_private *dip = NULL; + struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -8182,7 +8217,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_io_bio; + goto free_ordered; } dip->private = dio_bio->bi_private; @@ -8210,25 +8245,56 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); -free_io_bio: - bio_put(io_bio); free_ordered: /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. + * If we arrived here it means either we failed to submit the dip + * or we either failed to clone the dio_bio or failed to allocate the + * dip. If we cloned the dio_bio and allocated the dip, we can just + * call bio_endio against our io_bio so that we get proper resource + * cleanup if we fail to submit the dip, otherwise, we must do the + * same as btrfs_endio_direct_[write|read] because we can't call these + * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len, 1); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); + if (io_bio && dip) { + io_bio->bi_error = -EIO; + bio_endio(io_bio); + /* + * The end io callbacks free our dip, do the final put on io_bio + * and all the cleanup and final put for dio_bio (through + * dio_end_io()). + */ + dip = NULL; + io_bio = NULL; + } else { + if (write) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, + file_offset); + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + /* + * Decrements our ref on the ordered extent and removes + * the ordered extent from the inode's ordered tree, + * doing all the proper resource cleanup such as for the + * reserved space and waking up any waiters for this + * ordered extent (through btrfs_remove_ordered_extent). + */ + btrfs_finish_ordered_io(ordered); + } else { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + dio_bio->bi_error = -EIO; + /* + * Releases and cleans up our dio_bio, no need to bio_put() + * nor bio_endio()/bio_io_error() against dio_bio. + */ + dio_end_io(dio_bio, ret); } - bio_endio(dio_bio, ret); + if (io_bio) + bio_put(io_bio); + kfree(dip); } static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, @@ -8269,7 +8335,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - u64 outstanding_extents = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_data dio_data = { 0 }; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8307,7 +8374,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; - outstanding_extents = div64_u64(count + + dio_data.outstanding_extents = div64_u64(count + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); @@ -8316,7 +8383,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * do the accounting properly if we go over the number we * originally calculated. Abuse current->journal_info for this. */ - current->journal_info = &outstanding_extents; + dio_data.reserve = round_up(count, root->sectorsize); + current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_end(inode); @@ -8330,9 +8398,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) - btrfs_delalloc_release_space(inode, count); - else if (ret >= 0 && (size_t)ret < count) + if (ret < 0 && ret != -EIOCBQUEUED) { + if (dio_data.reserve) + btrfs_delalloc_release_space(inode, + dio_data.reserve); + } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); } |