diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 632 |
1 files changed, 488 insertions, 144 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41a5688ffdfe..8b1212e8f7a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -194,7 +194,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, while (compressed_size > 0) { cpage = compressed_pages[i]; cur_size = min_t(unsigned long, compressed_size, - PAGE_CACHE_SIZE); + PAGE_SIZE); kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); @@ -208,13 +208,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, compress_type); } else { page = find_get_page(inode->i_mapping, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_CACHE_SIZE - 1); + offset = start & (PAGE_SIZE - 1); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); - page_cache_release(page); + put_page(page); } btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -322,7 +322,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -435,8 +435,8 @@ static noinline void compress_file_range(struct inode *inode, actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; - nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -455,7 +455,7 @@ again: /* * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. + * isn't an inline extent, since it doesn't save disk space at all. 
*/ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) @@ -514,7 +514,7 @@ again: if (!ret) { unsigned long offset = total_compressed & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); struct page *page = pages[nr_pages_ret - 1]; char *kaddr; @@ -524,7 +524,7 @@ again: if (offset) { kaddr = kmap_atomic(page); memset(kaddr + offset, 0, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); kunmap_atomic(kaddr); } will_compress = 1; @@ -580,7 +580,7 @@ cont: * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = ALIGN(total_in, PAGE_CACHE_SIZE); + total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -594,7 +594,7 @@ cont: */ for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); pages = NULL; @@ -650,7 +650,7 @@ cleanup_and_bail_uncompressed: free_pages_out: for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); } @@ -664,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); + put_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; @@ -824,6 +824,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. 
@@ -861,6 +862,7 @@ retry: } return; out_free_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, @@ -966,7 +968,7 @@ static noinline int cow_file_range(struct inode *inode, PAGE_END_WRITEBACK); *nr_written = *nr_written + - (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; goto out; } else if (ret < 0) { @@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (disk_num_bytes < cur_alloc_size) break; @@ -1066,6 +1070,7 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -1106,8 +1111,8 @@ static noinline void async_cow_submit(struct btrfs_work *work) async_cow = container_of(work, struct async_cow, work); root = async_cow->root; - nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> + PAGE_SHIFT; /* * atomic_sub_return implies a barrier for waitqueue_active @@ -1164,8 +1169,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow_start, async_cow_submit, async_cow_free); - nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (cur_end - start + PAGE_SIZE) >> + PAGE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); btrfs_queue_work(root->fs_info->delalloc_workers, @@ -1377,6 +1382,9 @@ next_slot: */ if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out_check; + if 
(!btrfs_inc_nocow_writers(root->fs_info, + disk_bytenr)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1391,6 +1399,9 @@ out_check: path->slots[0]++; if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto next_slot; } if (!nocow) { @@ -1411,6 +1422,9 @@ out_check: if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto error; } cow_start = (u64)-1; @@ -1453,6 +1467,8 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, disk_bytenr); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == @@ -1960,9 +1976,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); + WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state, GFP_NOFS); + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -1993,7 +2009,7 @@ again: inode = page->mapping->host; page_start = page_offset(page); - page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + page_end = page_offset(page) + PAGE_SIZE - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -2003,7 +2019,7 @@ again: goto out; ordered = btrfs_lookup_ordered_range(inode, page_start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -2014,7 +2030,7 @@ again: } ret = btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, 
ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2030,7 +2046,7 @@ out: &cached_state, GFP_NOFS); out_page: unlock_page(page); - page_cache_release(page); + put_page(page); kfree(fixup); } @@ -2063,7 +2079,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) return -EAGAIN; SetPageChecked(page); - page_cache_get(page); + get_page(page); btrfs_init_work(&fixup->work, btrfs_fixup_helper, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@ -3103,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, - GFP_NOFS); + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); return 0; } @@ -3706,7 +3721,7 @@ cache_index: * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, - * but it guarantees correctness at the expense of ocassional full + * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. 
*/ @@ -4247,7 +4262,7 @@ static int truncate_inline_extent(struct inode *inode, if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + loff_t page_end = ALIGN(offset, PAGE_SIZE); /* * Zero out the remaining of the last page of our inline extent, @@ -4633,7 +4648,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; char *kaddr; u32 blocksize = root->sectorsize; - pgoff_t index = from >> PAGE_CACHE_SHIFT; + pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); @@ -4668,7 +4683,7 @@ again: lock_page(page); if (page->mapping != mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } if (!PageUptodate(page)) { @@ -4686,7 +4701,7 @@ again: unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); unlock_page(page); - page_cache_release(page); + put_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; @@ -4728,7 +4743,7 @@ out_unlock: btrfs_delalloc_release_space(inode, block_start, blocksize); unlock_page(page); - page_cache_release(page); + put_page(page); out: return ret; } @@ -4962,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * be instantly completed which will give us extents that need * to be truncated. If we fail to get an orphan inode down we * could have left over extents that were never meant to live, - * so we need to garuntee from this point on that everything + * so we need to guarantee from this point on that everything * will be consistent. 
*/ ret = btrfs_orphan_add(trans, inode); @@ -5232,7 +5247,7 @@ void btrfs_evict_inode(struct inode *inode) } /* - * We can't just steal from the global reserve, we need tomake + * We can't just steal from the global reserve, we need to make * sure there is room to do it, if not we need to commit and try * again. */ @@ -6717,7 +6732,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); kfree(tmp); @@ -6879,8 +6894,8 @@ next: size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, - size - extent_offset); + copy_size = min_t(u64, PAGE_SIZE - pg_offset, + size - extent_offset); em->start = extent_start + extent_offset; em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; @@ -6899,9 +6914,9 @@ next: map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); - if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + if (pg_offset + copy_size < PAGE_SIZE) { memset(map + pg_offset + copy_size, 0, - PAGE_CACHE_SIZE - pg_offset - + PAGE_SIZE - pg_offset - copy_size); } kunmap(page); @@ -6964,7 +6979,18 @@ insert: * existing will always be non-NULL, since there must be * extent causing the -EEXIST. 
*/ - if (start >= extent_map_end(existing) || + if (existing->start == em->start && + extent_map_end(existing) == extent_map_end(em) && + em->block_start == existing->block_start) { + /* + * these two extents are the same, it happens + * with inlines especially + */ + free_extent_map(em); + em = existing; + err = 0; + + } else if (start >= extent_map_end(existing) || start <= existing->start) { /* * The existing extent map is the one nearest to @@ -7129,6 +7155,43 @@ out: return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7144,41 +7207,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. 
- * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } return em; } @@ -7336,12 +7371,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int start_idx; int end_idx; - start_idx = start >> PAGE_CACHE_SHIFT; + start_idx = start >> PAGE_SHIFT; /* * end is the last byte in the last page. end == start is legal */ - end_idx = end >> PAGE_CACHE_SHIFT; + end_idx = end >> PAGE_SHIFT; rcu_read_lock(); @@ -7382,7 +7417,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) * include/linux/pagemap.h for details. 
*/ if (unlikely(page != *pagep)) { - page_cache_release(page); + put_page(page); page = NULL; } } @@ -7390,7 +7425,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) if (page) { if (page->index <= end_idx) found = true; - page_cache_release(page); + put_page(page); } rcu_read_unlock(); @@ -7408,7 +7443,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state); /* * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered + * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(inode, lockstart, @@ -7570,7 +7605,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (current->journal_info) { /* * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transction doesn't get + * that anything that needs to check if there's a transaction doesn't get * confused. */ dio_data = current->journal_info; @@ -7603,7 +7638,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * - * We return -ENOTBLK because thats what makes DIO go ahead and go back + * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. 
*/ @@ -7650,24 +7685,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; + + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(root->fs_info, block_start); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } + em = em2; } - - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); - if (ret) { - free_extent_map(em); + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); goto unlock_err; } goto unlock; @@ -8541,13 +8573,13 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_data dio_data = { 0 }; + loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8607,7 +8639,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ret = __blockdev_direct_IO(iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iter, offset, btrfs_get_blocks_direct, NULL, + iter, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; @@ -8719,7 +8751,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); - 
page_cache_release(page); + put_page(page); } return ret; } @@ -8739,7 +8771,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; int inode_evicting = inode->i_state & I_FREEING; @@ -8822,7 +8854,7 @@ again: * 2) Not written to disk * This means the reserved space should be freed here. */ - btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8837,7 +8869,7 @@ again: if (PagePrivate(page)) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } } @@ -8874,11 +8906,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; u64 end; - reserved_space = PAGE_CACHE_SIZE; + reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; end = page_end; /* @@ -8934,15 +8966,15 @@ again: goto again; } - if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) { + if (page->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, root->sectorsize); - if (reserved_space < PAGE_CACHE_SIZE) { + if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, page_start, - PAGE_CACHE_SIZE - reserved_space); + PAGE_SIZE - reserved_space); } } @@ -8969,14 +9001,14 @@ again: ret = 0; /* page is wholly or partially inside EOF */ - if (page_start + PAGE_CACHE_SIZE > size) - zero_start = size & 
~PAGE_CACHE_MASK; + if (page_start + PAGE_SIZE > size) + zero_start = size & ~PAGE_MASK; else - zero_start = PAGE_CACHE_SIZE; + zero_start = PAGE_SIZE; - if (zero_start != PAGE_CACHE_SIZE) { + if (zero_start != PAGE_SIZE) { kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); flush_dcache_page(page); kunmap(page); } @@ -9019,7 +9051,7 @@ static int btrfs_truncate(struct inode *inode) return ret; /* - * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have * 3 things going on here * * 1) We need to reserve space for our orphan item and the space to @@ -9033,15 +9065,15 @@ static int btrfs_truncate(struct inode *inode) * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be seperate. The fact is we can use alot of + * And we need these to all be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space - * we will use, so we need the truncate reservation to be seperate so it + * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode or * removing the orphan item. We also need to be able to stop the * transaction and start a new one, which means we need to be able to * update the inode several times, and we have no idea of knowing how * many times that will be, so we can't just reserve 1 item for the - * entirety of the opration, so that has to be done seperately as well. + * entirety of the operation, so that has to be done separately as well. * Then there is the orphan item, which does indeed need to be held on * to for the whole operation, and we need nobody to touch this reserved * space except the orphan code. 
@@ -9230,6 +9262,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } @@ -9387,10 +9420,281 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +static int btrfs_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + struct dentry *parent; + u64 old_ino = btrfs_ino(old_inode); + u64 new_ino = btrfs_ino(new_inode); + u64 old_idx = 0; + u64 new_idx = 0; + u64 root_objectid; + int ret; + bool root_log_pinned = false; + bool dest_log_pinned = false; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&dest->fs_info->subvol_sem); + + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items + * should cover the worst case number of items we'll modify. 
+ */ + trans = btrfs_start_transaction(root, 12); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + /* + * We need to find a free sequence number both in the source and + * in the destination directory for the exchange. + */ + ret = btrfs_set_inode_index(new_dir, &old_idx); + if (ret) + goto out_fail; + ret = btrfs_set_inode_index(old_dir, &new_idx); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + BTRFS_I(new_inode)->dir_index = 0ULL; + + /* Reference for the source. */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(root->fs_info, trans); + } else { + btrfs_pin_log_trans(root); + root_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), old_idx); + if (ret) + goto out_fail; + } + + /* And now for the dest. */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(dest->fs_info, trans); + } else { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, root, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_ino, + btrfs_ino(old_dir), new_idx); + if (ret) + goto out_fail; + } + + /* Update inode version and ctime/mtime. 
*/ + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + inode_inc_iversion(new_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + new_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) { + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); + } + + /* src is a subvolume */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, + root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { /* src is an inode */ + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + /* dest is a subvolume */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + } else { /* dest is an inode */ + ret = __btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, dest, new_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, old_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, old_dir, new_inode, + old_dentry->d_name.name, + old_dentry->d_name.len, 0, new_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + if 
(old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = old_idx; + if (new_inode->i_nlink == 1) + BTRFS_I(new_inode)->dir_index = new_idx; + + if (root_log_pinned) { + parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + parent = old_dentry->d_parent; + btrfs_log_new_name(trans, new_inode, new_dir, parent); + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } +out_fail: + /* + * If we have pinned a log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. 
+ */ + if (ret && (root_log_pinned || dest_log_pinned)) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + if (root_log_pinned) { + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } + } + ret = btrfs_end_transaction(trans, root); +out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&dest->fs_info->subvol_sem); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry) +{ + int ret; + struct inode *inode; + u64 objectid; + u64 index; + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + return ret; + + inode = btrfs_new_inode(trans, root, dir, + dentry->d_name.name, + dentry->d_name.len, + btrfs_ino(dir), + objectid, + S_IFCHR | WHITEOUT_MODE, + &index); + + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + return ret; + } + + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, + WHITEOUT_DEV); + + ret = btrfs_init_inode_security(trans, inode, dir, + &dentry->d_name); + if (ret) + goto out; + + ret = btrfs_add_nondir(trans, dir, dentry, + inode, 0, index); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); +out: + unlock_new_inode(inode); + if (ret) + inode_dec_link_count(inode); + iput(inode); + + return ret; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_trans_handle *trans; 
+ unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); @@ -9399,6 +9703,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 root_objectid; int ret; u64 old_ino = btrfs_ino(old_inode); + bool log_pinned = false; if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9449,15 +9754,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal + * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. + * If our rename has the whiteout flag, we need more 5 units for the + * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item + * when selinux is enabled). */ - trans = btrfs_start_transaction(root, 11); + trans_num_items = 11; + if (flags & RENAME_WHITEOUT) + trans_num_items += 5; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_notrans; - } + ret = PTR_ERR(trans); + goto out_notrans; + } if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -9471,6 +9782,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(root->fs_info, trans); } else { + btrfs_pin_log_trans(root); + log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9478,14 +9791,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_ino(new_dir), index); if (ret) goto out_fail; - /* - * this is an ugly little race, but the rename is required - * to make sure that if we crash, the inode is either at the - * old name or the new one. pinning the log transaction lets - * us make sure we don't allow a log commit to come in after - * we unlink the name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); } inode_inc_iversion(old_dir); @@ -9552,12 +9857,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (log_pinned) { struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); + log_pinned = false; + } + + if (flags & RENAME_WHITEOUT) { + ret = btrfs_whiteout_for_rename(trans, root, old_dir, + old_dentry); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } } out_fail: + /* + * If we have pinned the log and an error happened, we unpin tasks + * trying to sync the log and force them to fall back to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abort reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. 
+ */ + if (ret && log_pinned) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + btrfs_end_log_trans(root); + log_pinned = false; + } btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9570,10 +9909,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); + if (flags & RENAME_EXCHANGE) + return btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } static void btrfs_run_delalloc_work(struct btrfs_work *work) @@ -9942,6 +10285,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, @@ -10160,10 +10504,10 @@ static const struct inode_operations btrfs_dir_inode_operations = { .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, @@ -10184,7 +10528,7 @@ static const struct file_operations btrfs_dir_file_operations = { .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - 
.compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, @@ -10237,10 +10581,10 @@ static const struct address_space_operations btrfs_symlink_aops = { static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, @@ -10251,10 +10595,10 @@ static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, @@ -10265,10 +10609,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .update_time = btrfs_update_time, }; |