diff options
Diffstat (limited to 'fs')
77 files changed, 1141 insertions, 620 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 241ef68d2893..cd46e4158830 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -918,7 +918,7 @@ static int load_elf_binary(struct linux_binprm *bprm) total_size = total_mapping_size(elf_phdata, loc->elf_ex.e_phnum); if (!total_size) { - error = -EINVAL; + retval = -EINVAL; goto out_free_dentry; } } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9de772ee0031..614aaa1969bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -880,6 +880,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * + * NOTE: This can return values > 0 + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -1198,6 +1200,19 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } +/** + * btrfs_check_shared - tell us whether an extent is shared + * + * @trans: optional trans handle + * + * btrfs_check_shared uses the backref walking code but will short + * circuit as soon as it finds a root or inode that doesn't match the + * one passed in. This provides a significant performance benefit for + * callers (such as fiemap) which want to know whether the extent is + * shared but do not need a ref count. + * + * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. + */ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 root_objectid, u64 inum, u64 bytenr) @@ -1226,11 +1241,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, roots, NULL, root_objectid, inum); if (ret == BACKREF_FOUND_SHARED) { + /* this is the only condition under which we return 1 */ ret = 1; break; } if (ret < 0 && ret != -ENOENT) break; + ret = 0; node = ulist_next(tmp, &uiter); if (!node) break; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index cde698a07d21..a2ae42720a6a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1802,6 +1802,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1eef4ee01d1a..0ec3acd14cbf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3178,10 +3178,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); fail: - if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_release_path(path); return ret; } @@ -3305,8 +3303,7 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE) || - block_group->delalloc_bytes) { + !btrfs_test_opt(root, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3408,17 +3405,14 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); - if (!list_empty(&cur_trans->dirty_bgs)) { - list_splice_init(&cur_trans->dirty_bgs, &dirty); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; } + list_splice_init(&cur_trans->dirty_bgs, &dirty); spin_unlock(&cur_trans->dirty_bgs_lock); again: - if (list_empty(&dirty)) { - btrfs_free_path(path); - return 0; - } - /* * make sure all the block groups on our dirty list actually * exist @@ -3431,18 +3425,16 @@ again: return -ENOMEM; } + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); while (!list_empty(&dirty)) { cache = list_first_entry(&dirty, struct btrfs_block_group_cache, dirty_list); - - /* - * cache_write_mutex is here only to save us from balance - * deleting this block group while we are writing out the - * cache - */ - mutex_lock(&trans->transaction->cache_write_mutex); - /* * this can happen if something re-dirties a block * group that is already under IO. Just wait for it to @@ -3493,9 +3485,30 @@ again: ret = 0; } } - if (!ret) + if (!ret) { ret = write_one_cache_group(trans, root, path, cache); - mutex_unlock(&trans->transaction->cache_write_mutex); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, root, ret); + } + } /* if its not on the io list, we need to put the block group */ if (should_put) @@ -3503,7 +3516,16 @@ again: if (ret) break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); } + mutex_unlock(&trans->transaction->cache_write_mutex); /* * go through delayed refs for all the stuff we've just kicked off @@ -3514,8 +3536,15 @@ again: loops++; spin_lock(&cur_trans->dirty_bgs_lock); list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } spin_unlock(&cur_trans->dirty_bgs_lock); - goto again; } btrfs_free_path(path); @@ -3588,8 +3617,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, ret = 0; } } - if (!ret) + if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } /* if its not on the io list, we need to put the block group */ if (should_put) @@ -7537,7 +7569,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * returns the key for the extent through ins, and a tree buffer for * the first block of the extent through buf. * - * returns the tree buffer or NULL. + * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7548,6 +7580,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; u64 flags = 0; int ret; u32 blocksize = root->nodesize; @@ -7568,13 +7601,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(root, blocksize, blocksize, empty_size, hint, &ins, 0, 0); - if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); - return ERR_PTR(ret); - } + if (ret) + goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); - BUG_ON(IS_ERR(buf)); /* -ENOMEM */ + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -7584,9 +7618,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; extent_op = btrfs_alloc_delayed_extent_op(); - BUG_ON(!extent_op); /* -ENOMEM */ + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -7601,13 +7637,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, - ins.offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); - BUG_ON(ret); /* -ENOMEM */ + ins.objectid, ins.offset, + parent, root_objectid, level, + BTRFS_ADD_DELAYED_EXTENT, + extent_op, 0); + if (ret) + goto out_free_delayed; } return buf; + +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); +out_unuse: + unuse_block_rsv(root->fs_info, block_rsv, blocksize); + return ERR_PTR(ret); } struct walk_control { @@ -8782,6 +8829,24 @@ again: goto again; } + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. + */ + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } ret = set_block_group_ro(cache, 0); if (!ret) @@ -8795,7 +8860,9 @@ again: out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); + lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, root, alloc_flags); + unlock_chunks(root->fs_info->chunk_root); } mutex_unlock(&root->fs_info->ro_block_group_mutex); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 782f3bc4651d..c32d226bfecc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4560,36 +4560,37 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) do { index--; page = eb->pages[index]; - if (page && mapped) { + if (!page) + continue; + if (mapped) spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. + * We need to make sure we haven't be attached + * to a new eb. */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { - BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - } - if (page) { - /* One for when we alloced the page */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ page_cache_release(page); } + + if (mapped) + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ + page_cache_release(page); } while (index != 0); } @@ -4771,6 +4772,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); + /* + * Lock our eb's refs_lock to avoid races with + * free_extent_buffer. When we get our eb it might be flagged + * with EXTENT_BUFFER_STALE and another task running + * free_extent_buffer might have seen that flag set, + * eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process + * of decrementing the extent buffer's reference count twice. + * So here we could race and increment the eb's reference count, + * clear its stale flag, mark it as dirty and drop our reference + * before the other task finishes executing free_extent_buffer, + * which would later result in an attempt to free an extent + * buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); + } mark_extent_buffer_accessed(eb, NULL); return eb; } @@ -4870,6 +4890,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, mark_extent_buffer_accessed(exists, p); goto free_eb; } + exists = NULL; /* * Do this so attach doesn't complain and we need to @@ -4933,12 +4954,12 @@ again: return eb; free_eb: + WARN_ON(!atomic_dec_and_test(&eb->refs)); for (i = 0; i < num_pages; i++) { if (eb->pages[i]) unlock_page(eb->pages[i]); } - WARN_ON(!atomic_dec_and_test(&eb->refs)); btrfs_release_extent_buffer(eb); return exists; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 81fa75a8e1f3..9dbe5b548fa6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -86,7 +86,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, mapping_set_gfp_mask(inode->i_mapping, mapping_gfp_mask(inode->i_mapping) & - ~(GFP_NOFS & ~__GFP_HIGHMEM)); + ~(__GFP_FS | __GFP_HIGHMEM)); return inode; } @@ -1218,7 +1218,7 @@ out: * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. + * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, @@ -1235,12 +1235,12 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, int must_iput = 0; if (!i_size_read(inode)) - return -1; + return -EIO; WARN_ON(io_ctl->pages); ret = io_ctl_init(io_ctl, inode, root, 1); if (ret) - return -1; + return ret; if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { down_write(&block_group->data_rwsem); @@ -1258,7 +1258,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, inode, 0); + if (ret) + goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state); @@ -3464,6 +3466,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; int ret; struct btrfs_io_ctl io_ctl; + bool release_metadata = true; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; @@ -3471,11 +3474,20 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, memset(&io_ctl, 0, sizeof(io_ctl)); ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans, path, 0); - if (!ret) + if (!ret) { + /* + * At this point writepages() didn't error out, so our metadata + * reservation is released when the writeback finishes, at + * inode.c:btrfs_finish_ordered_io(), regardless of it finishing + * with or without an error. + */ + release_metadata = false; ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + } if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); + if (release_metadata) + btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ada4d24ed11b..8bb013672aee 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3632,25 +3632,28 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. + * + * This is required for both inode re-read from disk and delayed inode + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == root->fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - inode->i_version = btrfs_inode_sequence(leaf, inode_item); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; - rdev = btrfs_inode_rdev(leaf, inode_item); - - BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - -cache_index: path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b05653f182c2..1c22c6518504 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2410,7 +2410,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, "Attempt to delete subvolume %llu during send", dest->root_key.objectid); err = -EPERM; - goto out_dput; + goto out_unlock_inode; } d_invalidate(dentry); @@ -2505,6 +2505,7 @@ out_up_write: root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); } +out_unlock_inode: mutex_unlock(&inode->i_mutex); if (!err) { shrink_dcache_sb(root->fs_info->sb); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 157cc54fc634..760c4a5e096b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -722,6 +722,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { int ret = 0; + int ret_wb = 0; u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; @@ -741,9 +742,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (ret) return ret; - ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); - if (ret) - return ret; + /* + * If we have a writeback error don't return immediately. Wait first + * for any ordered extents that haven't completed yet. This is to make + * sure no one can dirty the same page ranges and call writepages() + * before the ordered extents complete - to avoid failures (-EEXIST) + * when adding the new ordered extents to the ordered tree. + */ + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; while (1) { @@ -767,7 +773,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) break; end--; } - return ret; + return ret_wb ? ret_wb : ret; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8bcd2a007517..174f5e1e00ab 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1058,6 +1058,7 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, struct extent_map *em; struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; + u64 physical_start = *start; again: list_for_each_entry(em, search_list, list) { @@ -1068,9 +1069,9 @@ again: for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev != device) continue; - if (map->stripes[i].physical >= *start + len || + if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= - *start) + physical_start) continue; *start = map->stripes[i].physical + em->orig_block_len; @@ -1193,8 +1194,14 @@ again: */ if (contains_pending_extent(trans, device, &search_start, - hole_size)) - hole_size = 0; + hole_size)) { + if (key.offset >= search_start) { + hole_size = key.offset - search_start; + } else { + WARN_ON_ONCE(1); + hole_size = 0; + } + } if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -4618,6 +4625,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { u64 chunk_offset; + ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex)); chunk_offset = find_next_chunk(extent_root->fs_info); return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 430e0348c99e..7dc886c9a78f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -24,6 +24,7 @@ #include "cifsfs.h" #include "dns_resolve.h" #include "cifs_debug.h" +#include "cifs_unicode.h" static LIST_HEAD(cifs_dfs_automount_list); @@ -312,7 +313,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) xid = get_xid(); rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, &num_referrals, &referrals, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); free_xid(xid); cifs_put_tlink(tlink); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 0303c6793d90..5a53ac6b1e02 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -27,41 +27,6 @@ #include "cifsglob.h" #include "cifs_debug.h" -/* - * cifs_utf16_bytes - how long will a string be after conversion? - * @utf16 - pointer to input string - * @maxbytes - don't go past this many bytes of input string - * @codepage - destination codepage - * - * Walk a utf16le string and return the number of bytes that the string will - * be after being converted to the given charset, not including any null - * termination required. Don't walk past maxbytes in the source buffer. - */ -int -cifs_utf16_bytes(const __le16 *from, int maxbytes, - const struct nls_table *codepage) -{ - int i; - int charlen, outlen = 0; - int maxwords = maxbytes / 2; - char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; - - for (i = 0; i < maxwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) - break; - - charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); - if (charlen > 0) - outlen += charlen; - else - outlen++; - } - - return outlen; -} - int cifs_remap(struct cifs_sb_info *cifs_sb) { int map_type; @@ -155,10 +120,13 @@ convert_sfm_char(const __u16 src_char, char *target) * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). */ static int -cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, int maptype) { int len = 1; + __u16 src_char; + + src_char = *from; if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target)) return len; @@ -168,10 +136,23 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, /* if character not one of seven in special remap set */ len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); - if (len <= 0) { - *target = '?'; - len = 1; - } + if (len <= 0) + goto surrogate_pair; + + return len; + +surrogate_pair: + /* convert SURROGATE_PAIR and IVS */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); + if (len <= 0) + goto unknown; + return len; + +unknown: + *target = '?'; + len = 1; return len; } @@ -206,7 +187,7 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; + __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ /* * because the chars can be of varying widths, we need to take care @@ -217,9 +198,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); for (i = 0; i < fromwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) break; + if (i + 1 < fromwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < fromwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; /* * check to see if converting this character might make the @@ -234,6 +223,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, /* put converted char into 'to' buffer */ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type); outlen += charlen; + + /* charlen (=bytes of UTF-8 for 1 character) + * 4bytes UTF-8(surrogate pair) is charlen=4 + * (4bytes UTF-16 code) + * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 + * (2 UTF-8 pairs divided to 2 UTF-16 pairs) */ + if (charlen == 4) + i++; + else if (charlen >= 5) + /* 5-6bytes UTF-8 */ + i += 2; } /* properly null-terminate string */ @@ -296,6 +296,46 @@ success: } /* + * cifs_utf16_bytes - how long will a string be after conversion? + * @utf16 - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + */ +int +cifs_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; + + for (i = 0; i < maxwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + if (i + 1 < maxwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < maxwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; + + charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD); + outlen += charlen; + } + + return outlen; +} + +/* * cifs_strndup_from_utf16 - copy a string from wire format to the local * codepage * @src - source string @@ -409,10 +449,15 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, char src_char; __le16 dst_char; wchar_t tmp; + wchar_t *wchar_to; /* UTF-16 */ + int ret; + unicode_t u; if (map_chars == NO_MAP_UNI_RSVD) return cifs_strtoUTF16(target, source, PATH_MAX, cp); + wchar_to = kzalloc(6, GFP_KERNEL); + for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; @@ -441,11 +486,55 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, * if no match, use question mark, which at least in * some cases serves as wild card */ - if (charlen < 1) { - dst_char = cpu_to_le16(0x003f); - charlen = 1; + if (charlen > 0) + goto ctoUTF16; + + /* convert SURROGATE_PAIR */ + if (strcmp(cp->charset, "utf8") || !wchar_to) + goto unknown; + if (*(source + i) & 0x80) { + charlen = utf8_to_utf32(source + i, 6, &u); + if (charlen < 0) + goto unknown; + } else + goto unknown; + ret = utf8s_to_utf16s(source + i, charlen, + UTF16_LITTLE_ENDIAN, + wchar_to, 6); + if (ret < 0) + goto unknown; + + i += charlen; + dst_char = cpu_to_le16(*wchar_to); + if (charlen <= 3) + /* 1-3bytes UTF-8 to 2bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + else if (charlen == 4) { + /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 + * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 + * (charlen=3+4 or 4+4) */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + } else if (charlen >= 5) { + /* 5-6bytes UTF-8 to 6bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 2)); + j++; + put_unaligned(dst_char, &target[j]); } + continue; + +unknown: + dst_char = cpu_to_le16(0x003f); + charlen = 1; } + +ctoUTF16: /* * character may take more than one byte in the source string, * but will take exactly two bytes in the target string @@ -456,6 +545,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, ctoUTF16_out: put_unaligned(0, &target[j]); /* Null terminate target unicode string */ + kfree(wchar_to); return j; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f5089bde3635..0a9fb6b53126 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nouser_xattr"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) seq_puts(s, ",mapchars"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + seq_puts(s, ",mapposix"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) seq_puts(s, ",sfu"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c31ce98c1704..c63fd1dde25b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -361,11 +361,11 @@ extern int CIFSUnixCreateHardLink(const unsigned int xid, extern int CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage); + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **syminfo, - const struct nls_table *nls_codepage); + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char **symlinkinfo, const struct nls_table *nls_codepage); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 84650a51c7c4..f26ffbfc64d8 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2784,7 +2784,7 @@ copyRetry: int CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage) + const struct nls_table *nls_codepage, int remap) { TRANSACTION2_SPI_REQ *pSMB = NULL; TRANSACTION2_SPI_RSP *pSMBr = NULL; @@ -2804,9 +2804,9 @@ createSymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName, - /* find define for this maxpathcomponent */ - PATH_MAX, nls_codepage); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fromName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; @@ -2828,9 +2828,9 @@ createSymLinkRetry: data_offset = (char *) (&pSMB->hdr.Protocol) + offset; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len_target = - cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX - /* find define for this maxpathcomponent */ - , nls_codepage); + cifsConvertToUTF16((__le16 *) data_offset, toName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); name_len_target++; /* trailing null */ name_len_target *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -3034,7 +3034,7 @@ winCreateHardLinkRetry: int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **symlinkinfo, - const struct nls_table *nls_codepage) + const struct nls_table *nls_codepage, int remap) { /* SMB_QUERY_FILE_UNIX_LINK */ TRANSACTION2_QPI_REQ *pSMB = NULL; @@ -3055,8 +3055,9 @@ querySymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage); + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -4917,7 +4918,7 @@ getDFSRetry: strncpy(pSMB->RequestFileName, search_name, name_len); } - if (ses->server && ses->server->sign) + if (ses->server->sign) pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; pSMB->hdr.Uid = ses->Suid; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f3bfe08e177b..8383d5ea4202 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -386,6 +386,7 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); + mutex_unlock(&server->srv_mutex); msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); @@ -393,8 +394,8 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); } - mutex_unlock(&server->srv_mutex); } while (server->tcpStatus == CifsNeedReconnect); return rc; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 338d56936f6a..c3eb998a99bd 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -620,8 +620,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, } rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc) goto mknod_out; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cafbf10521d5..3f50cee79df9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -140,8 +140,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_flags = cifs_posix_convert_flags(f_flags); rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data, poplock, full_path, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); if (rc) @@ -1553,8 +1552,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, flock); + if (flock->fl_flags & FL_POSIX && !rc) + rc = posix_lock_file_wait(file, flock); return rc; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 55b58112d122..f621b44cb800 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -373,8 +373,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); cifs_put_tlink(tlink); if (!rc) { @@ -402,9 +401,25 @@ int cifs_get_inode_info_unix(struct inode **pinode, rc = -ENOMEM; } else { /* we already have inode, update it */ + + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgiiu_exit; + } + + /* if filetype is different, return error */ + if (unlikely(((*pinode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgiiu_exit; + } + cifs_fattr_to_inode(*pinode, &fattr); } +cgiiu_exit: return rc; } @@ -839,6 +854,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (!*inode) rc = -ENOMEM; } else { + /* we already have inode, update it */ + + /* if filetype is different, return error */ + if (unlikely(((*inode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgii_exit; + } + cifs_fattr_to_inode(*inode, &fattr); } @@ -2215,8 +2239,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) pTcon = tlink_tcon(tlink); rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 252e672d5604..e6c707cc62b3 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -717,7 +717,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls); + cifs_sb->local_nls, + cifs_remap(cifs_sb)); /* else rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, cifs_sb_target->local_nls); */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b4a47237486b..b1eede3678a9 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -90,6 +90,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, if (dentry) { inode = d_inode(dentry); if (inode) { + if (d_mountpoint(dentry)) + goto out; /* * If we're generating inode numbers, then we don't * want to clobber the existing one with the one that diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 7bfdd6066276..fc537c29044e 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -960,7 +960,8 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, /* Check for unix extensions */ if (cap_unix(tcon->ses)) { rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, - cifs_sb->local_nls); + cifs_sb->local_nls, + cifs_remap(cifs_sb)); if (rc == -EREMOTE) rc = cifs_unix_dfs_readlink(xid, tcon, full_path, target_path, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 65cd7a84c8bc..54cbe19d9c08 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -110,7 +110,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ - if ((tcon->ses) && + if ((tcon->ses) && (tcon->ses->server) && (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) hdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index da94e41bdbf6..537356742091 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -173,5 +173,5 @@ MODULE_LICENSE("GPL"); MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); -module_init(configfs_init); +core_initcall(configfs_init); module_exit(configfs_exit); diff --git a/fs/dcache.c b/fs/dcache.c index 656ce522a218..37b5afdaf698 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1239,13 +1239,13 @@ ascend: /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; - next = child->d_child.next; - while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) { + /* go into the first sibling still alive */ + do { + next = child->d_child.next; if (next == &this_parent->d_subdirs) goto ascend; child = list_entry(next, struct dentry, d_child); - next = next->next; - } + } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); rcu_read_unlock(); goto resume; } diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 59fedbcf8798..86a2121828c3 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -121,7 +121,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, int len, i; int err = -ENOMEM; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return err; diff --git a/fs/exec.c b/fs/exec.c index 49a1c61433b7..1977c2a553ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -659,6 +659,9 @@ int setup_arg_pages(struct linux_binprm *bprm, if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; + /* Add space for stack randomization. */ + stack_base += (STACK_RND_MASK << PAGE_SHIFT); + /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 18228c201f7f..024f2284d3f6 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -64,8 +64,8 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. -config EXT4_FS_ENCRYPTION - bool "Ext4 Encryption" +config EXT4_ENCRYPTION + tristate "Ext4 Encryption" depends on EXT4_FS select CRYPTO_AES select CRYPTO_CBC @@ -81,6 +81,11 @@ config EXT4_FS_ENCRYPTION efficient since it avoids caching the encrypted and decrypted pages in the page cache. +config EXT4_FS_ENCRYPTION + bool + default y + depends on EXT4_ENCRYPTION + config EXT4_DEBUG bool "EXT4 debugging support" depends on EXT4_FS diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index ca2f5948c1ac..fded02f72299 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -66,6 +66,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; struct scatterlist sg[1]; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); char *workbuf; if (iname->len <= 0 || iname->len > ctx->lim) @@ -73,6 +74,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > ctx->lim) ? ctx->lim : ciphertext_len; @@ -101,7 +103,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, /* Create encryption request */ sg_init_table(sg, 1); sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); @@ -198,106 +200,57 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, return oname->len; } +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + /** * ext4_fname_encode_digest() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. */ -int ext4_fname_encode_digest(char *dst, char *src, u32 len) +static int digest_encode(const char *src, int len, char *dst) { - static const char *lookup_table = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+"; - u32 current_chunk, num_chunks, i; - char tmp_buf[3]; - u32 c0, c1, c2, c3; - - current_chunk = 0; - num_chunks = len/3; - for (i = 0; i < num_chunks; i++) { - c0 = src[3*i] & 0x3f; - c1 = (((src[3*i]>>6)&0x3) | ((src[3*i+1] & 0xf)<<2)) & 0x3f; - c2 = (((src[3*i+1]>>4)&0xf) | ((src[3*i+2] & 0x3)<<4)) & 0x3f; - c3 = (src[3*i+2]>>2) & 0x3f; - dst[4*i] = lookup_table[c0]; - dst[4*i+1] = lookup_table[c1]; - dst[4*i+2] = lookup_table[c2]; - dst[4*i+3] = lookup_table[c3]; - } - if (i*3 < len) { - memset(tmp_buf, 0, 3); - memcpy(tmp_buf, &src[3*i], len-3*i); - c0 = tmp_buf[0] & 0x3f; - c1 = (((tmp_buf[0]>>6)&0x3) | ((tmp_buf[1] & 0xf)<<2)) & 0x3f; - c2 = (((tmp_buf[1]>>4)&0xf) | ((tmp_buf[2] & 0x3)<<4)) & 0x3f; - c3 = (tmp_buf[2]>>2) & 0x3f; - dst[4*i] = lookup_table[c0]; - dst[4*i+1] = lookup_table[c1]; - dst[4*i+2] = lookup_table[c2]; - dst[4*i+3] = lookup_table[c3]; + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); i++; } - return (i * 4); + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; } -/** - * ext4_fname_hash() - - * - * This function computes the hash of the input filename, and sets the output - * buffer to the *encoded* digest. It returns the length of the digest as its - * return value. Errors are returned as negative numbers. We trust the caller - * to allocate sufficient memory to oname string. - */ -static int ext4_fname_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_str *iname, - struct ext4_str *oname) +static int digest_decode(const char *src, int len, char *dst) { - struct scatterlist sg; - struct hash_desc desc = { - .tfm = (struct crypto_hash *)ctx->htfm, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; - int res = 0; - - if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { - res = ext4_fname_encode_digest(oname->name, iname->name, - iname->len); - oname->len = res; - return res; - } - - sg_init_one(&sg, iname->name, iname->len); - res = crypto_hash_init(&desc); - if (res) { - printk(KERN_ERR - "%s: Error initializing crypto hash; res = [%d]\n", - __func__, res); - goto out; - } - res = crypto_hash_update(&desc, &sg, iname->len); - if (res) { - printk(KERN_ERR - "%s: Error updating crypto hash; res = [%d]\n", - __func__, res); - goto out; - } - res = crypto_hash_final(&desc, - &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE]); - if (res) { - printk(KERN_ERR - "%s: Error finalizing crypto hash; res = [%d]\n", - __func__, res); - goto out; + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; } - /* Encode the digest as a printable string--this will increase the - * size of the digest */ - oname->name[0] = 'I'; - res = ext4_fname_encode_digest(oname->name+1, - &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE], - EXT4_FNAME_CRYPTO_DIGEST_SIZE) + 1; - oname->len = res; -out: - return res; + if (ac) + return -1; + return cp - dst; } /** @@ -405,6 +358,7 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( if (IS_ERR(ctx)) return ctx; + ctx->flags = ei->i_crypt_policy_flags; if (ctx->has_valid_key) { if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { printk_once(KERN_WARNING @@ -517,6 +471,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen) { u32 ciphertext_len; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); if (ctx == NULL) return -EIO; @@ -524,6 +479,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, return -EACCES; ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : namelen; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > ctx->lim) ? ctx->lim : ciphertext_len; return (int) ciphertext_len; @@ -539,10 +495,13 @@ int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, u32 ilen, struct ext4_str *crypto_str) { unsigned int olen; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); if (!ctx) return -EIO; - olen = ext4_fname_crypto_round_up(ilen, EXT4_CRYPTO_BLOCK_SIZE); + if (padding < EXT4_CRYPTO_BLOCK_SIZE) + padding = EXT4_CRYPTO_BLOCK_SIZE; + olen = ext4_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; @@ -571,9 +530,13 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) * ext4_fname_disk_to_usr() - converts a filename from disk space to user space */ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_str *iname, - struct ext4_str *oname) + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname) { + char buf[24]; + int ret; + if (ctx == NULL) return -EIO; if (iname->len < 3) { @@ -587,18 +550,33 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, } if (ctx->has_valid_key) return ext4_fname_decrypt(ctx, iname, oname); - else - return ext4_fname_hash(ctx, iname, oname); + + if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hinfo) { + memcpy(buf, &hinfo->hash, 4); + memcpy(buf+4, &hinfo->minor_hash, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name+1); + oname->len = ret + 1; + return ret + 1; } int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname) { struct ext4_str iname = {.name = (unsigned char *) de->name, .len = de->name_len }; - return _ext4_fname_disk_to_usr(ctx, &iname, oname); + return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); } @@ -640,10 +618,11 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct dx_hash_info *hinfo) { - struct ext4_str tmp, tmp2; + struct ext4_str tmp; int ret = 0; + char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; - if (!ctx || !ctx->has_valid_key || + if (!ctx || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { @@ -651,59 +630,90 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, return 0; } + if (!ctx->has_valid_key && iname->name[0] == '_') { + if (iname->len != 33) + return -ENOENT; + ret = digest_decode(iname->name+1, iname->len, buf); + if (ret != 24) + return -ENOENT; + memcpy(&hinfo->hash, buf, 4); + memcpy(&hinfo->minor_hash, buf + 4, 4); + return 0; + } + + if (!ctx->has_valid_key && iname->name[0] != '_') { + if (iname->len > 43) + return -ENOENT; + ret = digest_decode(iname->name, iname->len, buf); + ext4fs_dirhash(buf, ret, hinfo); + return 0; + } + /* First encrypt the plaintext name */ ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); if (ret < 0) return ret; ret = ext4_fname_encrypt(ctx, iname, &tmp); - if (ret < 0) - goto out; - - tmp2.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; - tmp2.name = kmalloc(tmp2.len + 1, GFP_KERNEL); - if (tmp2.name == NULL) { - ret = -ENOMEM; - goto out; + if (ret >= 0) { + ext4fs_dirhash(tmp.name, tmp.len, hinfo); + ret = 0; } - ret = ext4_fname_hash(ctx, &tmp, &tmp2); - if (ret > 0) - ext4fs_dirhash(tmp2.name, tmp2.len, hinfo); - ext4_fname_crypto_free_buffer(&tmp2); -out: ext4_fname_crypto_free_buffer(&tmp); return ret; } -/** - * ext4_fname_disk_to_htree() - converts a filename from disk space to htree-access string - */ -int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_dir_entry_2 *de, - struct dx_hash_info *hinfo) +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de) { - struct ext4_str iname = {.name = (unsigned char *) de->name, - .len = de->name_len}; - struct ext4_str tmp; - int ret; + int ret = -ENOENT; + int bigname = (*name == '_'); - if (!ctx || - ((iname.name[0] == '.') && - ((iname.len == 1) || - ((iname.name[1] == '.') && (iname.len == 2))))) { - ext4fs_dirhash(iname.name, iname.len, hinfo); - return 0; + if (ctx->has_valid_key) { + if (cstr->name == NULL) { + struct qstr istr; + + ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); + if (ret < 0) + goto errout; + istr.name = name; + istr.len = len; + ret = ext4_fname_encrypt(ctx, &istr, cstr); + if (ret < 0) + goto errout; + } + } else { + if (cstr->name == NULL) { + cstr->name = kmalloc(32, GFP_KERNEL); + if (cstr->name == NULL) + return -ENOMEM; + if ((bigname && (len != 33)) || + (!bigname && (len > 43))) + goto errout; + ret = digest_decode(name+bigname, len-bigname, + cstr->name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + cstr->len = ret; + } + if (bigname) { + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + cstr->name + 8, 16); + return (ret == 0) ? 1 : 0; + } } - - tmp.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; - tmp.name = kmalloc(tmp.len + 1, GFP_KERNEL); - if (tmp.name == NULL) - return -ENOMEM; - - ret = ext4_fname_hash(ctx, &iname, &tmp); - if (ret > 0) - ext4fs_dirhash(tmp.name, tmp.len, hinfo); - ext4_fname_crypto_free_buffer(&tmp); + if (de->name_len != cstr->len) + return 0; + ret = memcmp(de->name, cstr->name, cstr->len); + return (ret == 0) ? 1 : 0; +errout: + kfree(cstr->name); + cstr->name = NULL; return ret; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c8392af8abbb..52170d0b7c40 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -110,6 +110,7 @@ int ext4_generate_encryption_key(struct inode *inode) } res = 0; + ei->i_crypt_policy_flags = ctx.flags; if (S_ISREG(inode->i_mode)) crypt_key->mode = ctx.contents_encryption_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 30eaf9e9864a..a6d6291aea16 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -37,6 +37,8 @@ static int ext4_is_encryption_context_consistent_with_policy( return 0; return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == + policy->flags) && (ctx.contents_encryption_mode == policy->contents_encryption_mode) && (ctx.filenames_encryption_mode == @@ -56,25 +58,25 @@ static int ext4_create_encryption_context_from_policy( printk(KERN_WARNING "%s: Invalid contents encryption mode %d\n", __func__, policy->contents_encryption_mode); - res = -EINVAL; - goto out; + return -EINVAL; } if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { printk(KERN_WARNING "%s: Invalid filenames encryption mode %d\n", __func__, policy->filenames_encryption_mode); - res = -EINVAL; - goto out; + return -EINVAL; } + if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) + return -EINVAL; ctx.contents_encryption_mode = policy->contents_encryption_mode; ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); -out: if (!res) ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); return res; @@ -115,6 +117,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) policy->version = 0; policy->contents_encryption_mode = ctx.contents_encryption_mode; policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); return 0; @@ -176,6 +179,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) EXT4_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; memset(ctx.master_key_descriptor, 0x42, EXT4_KEY_DESCRIPTOR_SIZE); res = 0; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 61db51a5ce4c..5665d82d2332 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -249,7 +249,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } else { /* Directory is encrypted */ err = ext4_fname_disk_to_usr(enc_ctx, - de, &fname_crypto_str); + NULL, de, &fname_crypto_str); if (err < 0) goto errout; if (!dir_emit(ctx, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ef267adce19a..9a83f149ac85 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -911,6 +911,7 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + char i_crypt_policy_flags; /* Indicate the inline data space. */ u16 i_inline_off; @@ -1066,12 +1067,6 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 -/* Encryption algorithms */ -#define EXT4_ENCRYPTION_MODE_INVALID 0 -#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 -#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 -#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 - /* * Structure of the super block */ @@ -2093,9 +2088,11 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, u32 ilen, struct ext4_str *crypto_str); int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname); int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname); int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, @@ -2104,11 +2101,12 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct dx_hash_info *hinfo); -int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_dir_entry_2 *de, - struct dx_hash_info *hinfo); int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen); +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de); + #ifdef CONFIG_EXT4_FS_ENCRYPTION void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); @@ -2891,7 +2889,6 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index c2ba35a914b6..d75159c101ce 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -20,12 +20,20 @@ struct ext4_encryption_policy { char version; char contents_encryption_mode; char filenames_encryption_mode; + char flags; char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; } __attribute__((__packed__)); #define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 #define EXT4_KEY_DERIVATION_NONCE_SIZE 16 +#define EXT4_POLICY_FLAGS_PAD_4 0x00 +#define EXT4_POLICY_FLAGS_PAD_8 0x01 +#define EXT4_POLICY_FLAGS_PAD_16 0x02 +#define EXT4_POLICY_FLAGS_PAD_32 0x03 +#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 +#define EXT4_POLICY_FLAGS_VALID 0x03 + /** * Encryption context for inode * @@ -41,7 +49,7 @@ struct ext4_encryption_context { char format; char contents_encryption_mode; char filenames_encryption_mode; - char reserved; + char flags; char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; } __attribute__((__packed__)); @@ -120,6 +128,7 @@ struct ext4_fname_crypto_ctx { struct crypto_hash *htfm; struct page *workpage; struct ext4_encryption_key key; + unsigned flags : 8; unsigned has_valid_key : 1; unsigned ctfm_key_is_ready : 1; }; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3445035c7e01..d41843181818 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -87,6 +87,12 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) ext4_put_nojournal(handle); return 0; } + + if (!handle->h_transaction) { + err = jbd2_journal_stop(handle); + return handle->h_err ? handle->h_err : err; + } + sb = handle->h_transaction->t_journal->j_private; err = handle->h_err; rc = jbd2_journal_stop(handle); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 973816bfe4a9..e003a1e81dc3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -377,7 +377,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); ext4_lblk_t last = lblock + len - 1; - if (lblock > last) + if (len == 0 || lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -4927,13 +4927,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; - if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); @@ -4955,6 +4948,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) mutex_lock(&inode->i_mutex); + /* + * We only support preallocation for extent-based files only + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -5395,6 +5396,14 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size, ioffset; int ret; + /* + * We need to test this early because xfstests assumes that a + * collapse range of (0, 1) will return EOPNOTSUPP if the file + * system does not support collapse range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || len & (EXT4_CLUSTER_SIZE(sb) - 1)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d33d5a6852b9..26724aeece73 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -703,6 +703,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, BUG_ON(end < lblk); + if ((status & EXTENT_STATUS_DELAYED) && + (status & EXTENT_STATUS_WRITTEN)) { + ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " + " delayed and written which can potentially " + " cause data loss.\n", lblk, len); + WARN_ON(1); + } + newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbd0654a2675..0554b0b5957b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -531,6 +531,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; @@ -635,6 +636,7 @@ found: status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; @@ -4343,7 +4345,7 @@ static void ext4_update_other_inodes_time(struct super_block *sb, int inode_size = EXT4_INODE_SIZE(sb); oi.orig_ino = orig_ino; - ino = orig_ino & ~(inodes_per_block - 1); + ino = (orig_ino & ~(inodes_per_block - 1)) + 1; for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7223b0b4bc38..814f3beb4369 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -640,7 +640,7 @@ static struct stats dx_show_leaf(struct inode *dir, ext4_put_fname_crypto_ctx(&ctx); ctx = NULL; } - res = ext4_fname_disk_to_usr(ctx, de, + res = ext4_fname_disk_to_usr(ctx, NULL, de, &fname_crypto_str); if (res < 0) { printk(KERN_WARNING "Error " @@ -653,15 +653,8 @@ static struct stats dx_show_leaf(struct inode *dir, name = fname_crypto_str.name; len = fname_crypto_str.len; } - res = ext4_fname_disk_to_hash(ctx, de, - &h); - if (res < 0) { - printk(KERN_WARNING "Error " - "converting filename " - "from disk to htree" - "\n"); - h.hash = 0xDEADBEEF; - } + ext4fs_dirhash(de->name, de->name_len, + &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -1008,15 +1001,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* silently ignore the rest of the block */ break; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - err = ext4_fname_disk_to_hash(ctx, de, hinfo); - if (err < 0) { - count = err; - goto errout; - } -#else ext4fs_dirhash(de->name, de->name_len, hinfo); -#endif if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1032,7 +1017,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, &tmp_str); } else { /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(ctx, de, + err = ext4_fname_disk_to_usr(ctx, hinfo, de, &fname_crypto_str); if (err < 0) { count = err; @@ -1193,26 +1178,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_fname_crypto_ctx *ctx = NULL; - int err; - - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); -#endif while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - err = ext4_fname_disk_to_hash(ctx, de, &h); - if (err < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return err; - } -#else ext4fs_dirhash(de->name, de->name_len, &h); -#endif map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1223,9 +1192,6 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, /* XXX: do we need to check rec_len == 0 case? -Chris */ de = ext4_next_entry(de, blocksize); } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&ctx); -#endif return count; } @@ -1287,16 +1253,8 @@ static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, return 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) { - /* Directory is encrypted */ - res = ext4_fname_disk_to_usr(ctx, de, fname_crypto_str); - if (res < 0) - return res; - if (len != res) - return 0; - res = memcmp(name, fname_crypto_str->name, len); - return (res == 0) ? 1 : 0; - } + if (ctx) + return ext4_fname_match(ctx, fname_crypto_str, len, name, de); #endif if (len != de->name_len) return 0; @@ -1324,16 +1282,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, if (IS_ERR(ctx)) return -1; - if (ctx != NULL) { - /* Allocate buffer to hold maximum name length */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -1; - } - } - de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { @@ -1872,14 +1820,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return res; } reclen = EXT4_DIR_REC_LEN(res); - - /* Allocate buffer to hold maximum name length */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -1; - } } de = (struct ext4_dir_entry_2 *)buf; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 8a8ec6293b19..cf0c472047e3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1432,12 +1432,15 @@ static int ext4_flex_group_add(struct super_block *sb, goto exit; /* * We will always be modifying at least the superblock and GDT - * block. If we are adding a group past the last current GDT block, + * blocks. If we are adding a group past the last current GDT block, * we will also modify the inode and the dindirect block. If we * are adding a group with superblock/GDT backups we will also * modify each of the reserved GDT dindirect blocks. */ - credit = flex_gd->count * 4 + reserved_gdb; + credit = 3; /* sb, resize inode, resize inode dindirect */ + /* GDT blocks */ + credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); + credit += reserved_gdb; /* Reserved GDT dindirect blocks */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f06d0589ddba..ca9d4a2fed41 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -294,6 +294,8 @@ static void __save_error_info(struct super_block *sb, const char *func, struct ext4_super_block *es = EXT4_SB(sb)->s_es; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (bdev_read_only(sb->s_bdev)) + return; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); es->s_last_error_time = cpu_to_le32(get_seconds()); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 19f78f20975e..187b78920314 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -74,7 +74,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) goto errout; } pstr.name = paddr; - res = _ext4_fname_disk_to_usr(ctx, &cstr, &pstr); + res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); if (res < 0) goto errout; /* Null-terminate the name */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b91b0e10678e..1e1aae669fa8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1513,6 +1513,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool locked = false; int ret; long diff; @@ -1533,7 +1534,13 @@ static int f2fs_write_data_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, DATA, wbc); + if (!S_ISDIR(inode->i_mode)) { + mutex_lock(&sbi->writepages); + locked = true; + } ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + if (locked) + mutex_unlock(&sbi->writepages); f2fs_submit_merged_bio(sbi, DATA, WRITE); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d8921cf2ba9a..8de34ab6d5b1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -625,6 +625,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7e3794edae42..658e8079aaf9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -298,16 +298,14 @@ fail: static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct page *page; + struct page *page = page_follow_link_light(dentry, nd); - page = page_follow_link_light(dentry, nd); - if (IS_ERR(page)) + if (IS_ERR_OR_NULL(page)) return page; /* this is broken symlink case */ if (*nd_get_link(nd) == 0) { - kunmap(page); - page_cache_release(page); + page_put_link(dentry, nd, page); return ERR_PTR(-ENOENT); } return page; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 160b88346b24..b2dd1b01f076 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1035,6 +1035,7 @@ try_onemore: sbi->raw_super = raw_super; sbi->raw_super_buf = raw_super_buf; mutex_init(&sbi->gc_mutex); + mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); clear_sbi_flag(sbi, SBI_POR_DOING); diff --git a/fs/fhandle.c b/fs/fhandle.c index 999ff5c3cab0..d59712dfa3e7 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -195,8 +195,9 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, goto out_err; } /* copy the full handle */ - if (copy_from_user(handle, ufh, - sizeof(struct file_handle) + + *handle = f_handle; + if (copy_from_user(&handle->f_handle, + &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; goto out_handle; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ef263174acd2..07d8d8f52faf 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -581,7 +581,7 @@ static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (name == NULL) goto out_put; - fd = file_create(name, mode & S_IFMT); + fd = file_create(name, mode & 0777); if (fd < 0) error = fd; else diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index b5128c6e63ad..a9079d035ae5 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -842,15 +842,23 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, { jbd2_journal_revoke_header_t *header; int offset, max; + int csum_size = 0; + __u32 rcount; int record_len = 4; header = (jbd2_journal_revoke_header_t *) bh->b_data; offset = sizeof(jbd2_journal_revoke_header_t); - max = be32_to_cpu(header->r_count); + rcount = be32_to_cpu(header->r_count); if (!jbd2_revoke_block_csum_verify(journal, header)) return -EINVAL; + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_revoke_tail); + if (rcount > journal->j_blocksize - csum_size) + return -EINVAL; + max = rcount; + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) record_len = 8; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index c6cbaef2bda1..14214da80eb8 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -577,7 +577,7 @@ static void write_one_revoke_record(journal_t *journal, { int csum_size = 0; struct buffer_head *descriptor; - int offset; + int sz, offset; journal_header_t *header; /* If we are already aborting, this all becomes a noop. We @@ -594,9 +594,14 @@ static void write_one_revoke_record(journal_t *journal, if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + sz = 8; + else + sz = 4; + /* Make sure we have a descriptor with space left for the record */ if (descriptor) { - if (offset >= journal->j_blocksize - csum_size) { + if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } @@ -619,16 +624,13 @@ static void write_one_revoke_record(journal_t *journal, *descriptorp = descriptor; } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); - offset += 8; - - } else { + else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); - offset += 4; - } + offset += sz; *offsetp = offset; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5f09370c90a8..ff2f2e6ad311 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -551,7 +551,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) int result; int wanted; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -627,7 +626,6 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) tid_t tid; int need_to_start, ret; - WARN_ON(!transaction); /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) @@ -785,7 +783,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int need_copy = 0; unsigned long start_lock, time_lock; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1051,7 +1048,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) int err; jbd_debug(5, "journal_head %p\n", jh); - WARN_ON(!transaction); err = -EROFS; if (is_handle_aborted(handle)) goto out; @@ -1266,7 +1262,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int ret = 0; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1397,7 +1392,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) int err = 0; int was_modified = 0; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1530,8 +1524,22 @@ int jbd2_journal_stop(handle_t *handle) tid_t tid; pid_t pid; - if (!transaction) - goto free_and_exit; + if (!transaction) { + /* + * Handle is already detached from the transaction so + * there is nothing to do other than decrease a refcount, + * or free the handle if refcount drops to zero + */ + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } else { + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); + goto free_and_exit; + } + } journal = transaction->t_journal; J_ASSERT(journal_current_handle() == handle); @@ -2373,7 +2381,6 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) transaction_t *transaction = handle->h_transaction; journal_t *journal; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index f131fc23ffc4..fffca9517321 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -518,7 +518,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + /* + * If the ino of the sysfs entry created for a kmem cache gets + * allocated from an ida layer, which is accounted to the memcg that + * owns the cache, the memcg will get pinned forever. So do not account + * ino ida allocations. + */ + ret = ida_simple_get(&root->ino_ida, 1, 0, + GFP_KERNEL | __GFP_NOACCOUNT); if (ret < 0) goto err_out2; kn->ino = ret; diff --git a/fs/namei.c b/fs/namei.c index 4a8d998b7274..fe30d3be43a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1415,6 +1415,7 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; + bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1424,8 +1425,11 @@ static int lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = dentry->d_inode; + negative = d_is_negative(dentry); if (read_seqcount_retry(&dentry->d_seq, seq)) return -ECHILD; + if (negative) + return -ENOENT; /* * This sequence count validates that the parent had no @@ -1472,6 +1476,10 @@ unlazy: goto need_lookup; } + if (unlikely(d_is_negative(dentry))) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1583,10 +1591,10 @@ static inline int walk_component(struct nameidata *nd, struct path *path, goto out_err; inode = path->dentry->d_inode; + err = -ENOENT; + if (d_is_negative(path->dentry)) + goto out_path_put; } - err = -ENOENT; - if (d_is_negative(path->dentry)) - goto out_path_put; if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { @@ -3036,14 +3044,13 @@ retry_lookup: BUG_ON(nd->flags & LOOKUP_RCU); inode = path->dentry->d_inode; -finish_lookup: - /* we _can_ be in RCU mode here */ error = -ENOENT; if (d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } - +finish_lookup: + /* we _can_ be in RCU mode here */ if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(nd->path.mnt != path->mnt || @@ -3226,7 +3233,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); - goto out; + goto out2; } error = path_init(dfd, pathname, flags, nd); @@ -3256,6 +3263,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, } out: path_cleanup(nd); +out2: if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); diff --git a/fs/namespace.c b/fs/namespace.c index 1f4f9dac6e5a..1b9e11167bae 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3179,6 +3179,12 @@ bool fs_fully_visible(struct file_system_type *type) if (mnt->mnt.mnt_sb->s_type != type) continue; + /* This mount is not fully visible if it's root directory + * is not the root directory of the filesystem. + */ + if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + continue; + /* This mount is not fully visible if there are any child mounts * that cover anything except for empty directories. */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 45b35b9b1e36..55e1e3af23a3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,6 +38,7 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/file.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/printk.h> @@ -5604,6 +5605,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); + get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -5716,6 +5718,7 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); + fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d12a4be613a5..dfc19f1575a1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1845,12 +1845,15 @@ int nfs_wb_all(struct inode *inode) trace_nfs_writeback_inode_enter(inode); ret = filemap_write_and_wait(inode->i_mapping); - if (!ret) { - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (!ret) - pnfs_sync_inode(inode, true); - } + if (ret) + goto out; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out; + pnfs_sync_inode(inode, true); + ret = 0; +out: trace_nfs_writeback_inode_exit(inode, ret); return ret; } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 03d647bf195d..cdefaa331a07 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -181,6 +181,17 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, } const struct nfsd4_layout_ops bl_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, .proc_layoutget = nfsd4_block_proc_layoutget, diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 58277859a467..5694cfb7a47b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -224,7 +224,7 @@ static int nfs_cb_stat_to_errno(int status) } static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, - enum nfsstat4 *status) + int *status) { __be32 *p; u32 op; @@ -235,7 +235,7 @@ static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, op = be32_to_cpup(p++); if (unlikely(op != expected)) goto out_unexpected; - *status = be32_to_cpup(p); + *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -446,22 +446,16 @@ out_overflow: static int decode_cb_sequence4res(struct xdr_stream *xdr, struct nfsd4_callback *cb) { - enum nfsstat4 nfserr; int status; if (cb->cb_minorversion == 0) return 0; - status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - goto out_default; - status = decode_cb_sequence4resok(xdr, cb); -out: - return status; -out_default: - return nfs_cb_stat_to_errno(nfserr); + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status); + if (unlikely(status || cb->cb_status)) + return status; + + return decode_cb_sequence4resok(xdr, cb); } /* @@ -524,26 +518,19 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; if (cb != NULL) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } #ifdef CONFIG_NFSD_PNFS @@ -621,24 +608,18 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; + if (cb) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -898,13 +879,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) if (!nfsd41_cb_get_slot(clp, task)) return; } - spin_lock(&clp->cl_lock); - if (list_empty(&cb->cb_per_client)) { - /* This is the first call, not a restart */ - cb->cb_done = false; - list_add(&cb->cb_per_client, &clp->cl_callbacks); - } - spin_unlock(&clp->cl_lock); rpc_call_start(task); } @@ -918,22 +892,33 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) if (clp->cl_minorversion) { /* No need for lock, access serialized in nfsd4_cb_prepare */ - ++clp->cl_cb_session->se_cb_seq_nr; + if (!task->tk_status) + ++clp->cl_cb_session->se_cb_seq_nr; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); } - if (clp->cl_cb_client != task->tk_client) { - /* We're shutting down or changing cl_cb_client; leave - * it to nfsd4_process_cb_update to restart the call if - * necessary. */ + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (task->tk_flags & RPC_TASK_KILLED) { + task->tk_status = 0; + cb->cb_need_restart = true; return; } - if (cb->cb_done) - return; + if (cb->cb_status) { + WARN_ON_ONCE(task->tk_status); + task->tk_status = cb->cb_status; + } switch (cb->cb_ops->done(cb, task)) { case 0: @@ -949,21 +934,17 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) default: BUG(); } - cb->cb_done = true; } static void nfsd4_cb_release(void *calldata) { struct nfsd4_callback *cb = calldata; - struct nfs4_client *clp = cb->cb_clp; - - if (cb->cb_done) { - spin_lock(&clp->cl_lock); - list_del(&cb->cb_per_client); - spin_unlock(&clp->cl_lock); + if (cb->cb_need_restart) + nfsd4_run_cb(cb); + else cb->cb_ops->release(cb); - } + } static const struct rpc_call_ops nfsd4_cb_ops = { @@ -1058,9 +1039,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) nfsd4_mark_cb_down(clp, err); return; } - /* Yay, the callback channel's back! Restart any callbacks: */ - list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) - queue_work(callback_wq, &cb->cb_work); } static void @@ -1071,8 +1049,12 @@ nfsd4_run_cb_work(struct work_struct *work) struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); @@ -1084,6 +1066,15 @@ nfsd4_run_cb_work(struct work_struct *work) cb->cb_ops->release(cb); return; } + + /* + * Don't send probe messages for 4.1 or later. + */ + if (!cb->cb_ops && clp->cl_minorversion) { + clp->cl_cb_state = NFSD4_CB_UP; + return; + } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); @@ -1098,8 +1089,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); - INIT_LIST_HEAD(&cb->cb_per_client); - cb->cb_done = true; + cb->cb_status = 0; + cb->cb_need_restart = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 38f2d7abe3a7..039f9c8a95e8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -94,6 +94,7 @@ static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; +static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); @@ -281,6 +282,7 @@ put_nfs4_file(struct nfs4_file *fi) if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); + WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); } @@ -471,6 +473,86 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) __nfs4_file_put_access(fp, O_RDONLY); } +/* + * Allocate a new open/delegation state counter. This is needed for + * pNFS for proper return on close semantics. + * + * Note that we only allocate it for pNFS-enabled exports, otherwise + * all pointers to struct nfs4_clnt_odstate are always NULL. + */ +static struct nfs4_clnt_odstate * +alloc_clnt_odstate(struct nfs4_client *clp) +{ + struct nfs4_clnt_odstate *co; + + co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); + if (co) { + co->co_client = clp; + atomic_set(&co->co_odcount, 1); + } + return co; +} + +static void +hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp = co->co_file; + + lockdep_assert_held(&fp->fi_lock); + list_add(&co->co_perfile, &fp->fi_clnt_odstate); +} + +static inline void +get_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + if (co) + atomic_inc(&co->co_odcount); +} + +static void +put_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp; + + if (!co) + return; + + fp = co->co_file; + if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + list_del(&co->co_perfile); + spin_unlock(&fp->fi_lock); + + nfsd4_return_all_file_layouts(co->co_client, fp); + kmem_cache_free(odstate_slab, co); + } +} + +static struct nfs4_clnt_odstate * +find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) +{ + struct nfs4_clnt_odstate *co; + struct nfs4_client *cl; + + if (!new) + return NULL; + + cl = new->co_client; + + spin_lock(&fp->fi_lock); + list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { + if (co->co_client == cl) { + get_clnt_odstate(co); + goto out; + } + } + co = new; + co->co_file = fp; + hash_clnt_odstate_locked(new); +out: + spin_unlock(&fp->fi_lock); + return co; +} + struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { @@ -606,7 +688,8 @@ static void block_delegations(struct knfsd_fh *fh) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) +alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, + struct nfs4_clnt_odstate *odstate) { struct nfs4_delegation *dp; long n; @@ -631,6 +714,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_clnt_odstate = odstate; + get_clnt_odstate(odstate); dp->dl_type = NFS4_OPEN_DELEGATE_READ; dp->dl_retries = 1; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -714,6 +799,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) spin_lock(&state_lock); unhash_delegation_locked(dp); spin_unlock(&state_lock); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -724,6 +810,7 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); if (clp->cl_minorversion == 0) @@ -933,6 +1020,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); + put_clnt_odstate(stp->st_clnt_odstate); release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); @@ -1538,7 +1626,6 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); @@ -1634,6 +1721,7 @@ __destroy_client(struct nfs4_client *clp) while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -3057,6 +3145,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + INIT_LIST_HEAD(&fp->fi_clnt_odstate); fh_copy_shallow(&fp->fi_fhandle, fh); fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; @@ -3073,6 +3162,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, void nfsd4_free_slabs(void) { + kmem_cache_destroy(odstate_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); @@ -3103,8 +3193,14 @@ nfsd4_init_slabs(void) sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) goto out_free_stateid_slab; + odstate_slab = kmem_cache_create("nfsd4_odstate", + sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + if (odstate_slab == NULL) + goto out_free_deleg_slab; return 0; +out_free_deleg_slab: + kmem_cache_destroy(deleg_slab); out_free_stateid_slab: kmem_cache_destroy(stateid_slab); out_free_file_slab: @@ -3581,6 +3677,14 @@ alloc_stateid: open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; + + if (nfsd4_has_session(cstate) && + (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { + open->op_odstate = alloc_clnt_odstate(clp); + if (!open->op_odstate) + return nfserr_jukebox; + } + return nfs_ok; } @@ -3869,7 +3973,7 @@ out_fput: static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, - struct nfs4_file *fp) + struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) { int status; struct nfs4_delegation *dp; @@ -3877,7 +3981,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - dp = alloc_init_deleg(clp, fh); + dp = alloc_init_deleg(clp, fh, odstate); if (!dp) return ERR_PTR(-ENOMEM); @@ -3903,6 +4007,7 @@ out_unlock: spin_unlock(&state_lock); out: if (status) { + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); return ERR_PTR(status); } @@ -3980,7 +4085,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, default: goto out_no_deleg; } - dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file); + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate); if (IS_ERR(dp)) goto out_no_deleg; @@ -4069,6 +4174,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf release_open_stateid(stp); goto out; } + + stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, + open->op_odstate); + if (stp->st_clnt_odstate == open->op_odstate) + open->op_odstate = NULL; } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -4129,6 +4239,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); + if (open->op_odstate) + kmem_cache_free(odstate_slab, open->op_odstate); } __be32 @@ -4385,10 +4497,17 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) +{ + if (ols->st_stateowner->so_is_open_owner && + !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; +} + static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; - struct nfs4_ol_stateid *ols; __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) @@ -4418,13 +4537,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: - ols = openlockstateid(s); - if (ols->st_stateowner->so_is_open_owner - && !(openowner(ols->st_stateowner)->oo_flags - & NFS4_OO_CONFIRMED)) - status = nfserr_bad_stateid; - else - status = nfs_ok; + status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); @@ -4516,8 +4629,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, status = nfs4_check_fh(current_fh, stp); if (status) goto out; - if (stp->st_stateowner->so_is_open_owner - && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + status = nfsd4_check_openowner_confirmed(stp); + if (status) goto out; status = nfs4_check_openmode(stp, flags); if (status) @@ -4852,9 +4965,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, - stp->st_stid.sc_file); - nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ @@ -6488,6 +6598,7 @@ nfs4_state_shutdown_net(struct net *net) list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4f3bfeb11766..dbc4f85a5008 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,12 +63,12 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; - bool cb_done; + int cb_status; + bool cb_need_restart; }; struct nfsd4_callback_ops { @@ -126,6 +126,7 @@ struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ + struct nfs4_clnt_odstate *dl_clnt_odstate; u32 dl_type; time_t dl_time; /* For recall: */ @@ -332,7 +333,6 @@ struct nfs4_client { int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; - struct list_head cl_callbacks; /* list of in-progress callbacks */ /* for all client information that callback code might need: */ spinlock_t cl_lock; @@ -465,6 +465,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) } /* + * Per-client state indicating no. of opens and outstanding delegations + * on a file from a particular client.'od' stands for 'open & delegation' + */ +struct nfs4_clnt_odstate { + struct nfs4_client *co_client; + struct nfs4_file *co_file; + struct list_head co_perfile; + atomic_t co_odcount; +}; + +/* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * * These objects are global. nfsd keeps one instance of a nfs4_file per @@ -485,6 +496,7 @@ struct nfs4_file { struct list_head fi_delegations; struct rcu_head fi_rcu; }; + struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* @@ -526,6 +538,7 @@ struct nfs4_ol_stateid { struct list_head st_perstateowner; struct list_head st_locks; struct nfs4_stateowner * st_stateowner; + struct nfs4_clnt_odstate * st_clnt_odstate; unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid * st_openstp; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index f982ae84f0cd..2f8c092be2b3 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -247,6 +247,7 @@ struct nfsd4_open { struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ + struct nfs4_clnt_odstate *op_odstate; /* used during processing */ struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 059f37137f9a..919fd5bb14a8 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, nchildren = nilfs_btree_node_get_nchildren(node); if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || - level > NILFS_BTREE_LEVEL_MAX || + level >= NILFS_BTREE_LEVEL_MAX || nchildren < 0 || nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a6944b25fd5b..fdf4b41d0609 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -757,6 +757,19 @@ lookup: if (tmpres) { spin_unlock(&dlm->spinlock); spin_lock(&tmpres->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&tmpres->hash_node)) { + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + /* Wait on the thread that is mastering the resource */ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { __dlm_wait_on_lockres(tmpres); diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c index 082234581d05..83f4e76511c2 100644 --- a/fs/omfs/bitmap.c +++ b/fs/omfs/bitmap.c @@ -159,7 +159,7 @@ int omfs_allocate_range(struct super_block *sb, goto out; found: - *return_block = i * bits_per_entry + bit; + *return_block = (u64) i * bits_per_entry + bit; *return_size = run; ret = set_run(sb, i, bits_per_entry, bit, run, 1); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 138321b0c6c2..3d935c81789a 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -306,7 +306,8 @@ static const struct super_operations omfs_sops = { */ static int omfs_get_imap(struct super_block *sb) { - unsigned int bitmap_size, count, array_size; + unsigned int bitmap_size, array_size; + int count; struct omfs_sb_info *sbi = OMFS_SB(sb); struct buffer_head *bh; unsigned long **ptr; @@ -359,7 +360,7 @@ nomem: } enum { - Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err }; static const match_table_t tokens = { @@ -368,6 +369,7 @@ static const match_table_t tokens = { {Opt_umask, "umask=%o"}, {Opt_dmask, "dmask=%o"}, {Opt_fmask, "fmask=%o"}, + {Opt_err, NULL}, }; static int parse_options(char *options, struct omfs_sb_info *sbi) @@ -548,8 +550,10 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); - if (!sb->s_root) + if (!sb->s_root) { + ret = -ENOMEM; goto out_brelse_bh2; + } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 24f640441bd9..84d693d37428 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -299,6 +299,9 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct cred *override_cred; char *link = NULL; + if (WARN_ON(!workdir)) + return -EROFS; + ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index d139405d2bfa..692ceda3bc21 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -222,6 +222,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, struct kstat stat; int err; + if (WARN_ON(!workdir)) + return ERR_PTR(-EROFS); + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -322,6 +325,9 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; + if (WARN_ON(!workdir)) + return -EROFS; + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -506,11 +512,28 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { - opaquedir = ovl_check_empty_and_clear(dentry); - err = PTR_ERR(opaquedir); - if (IS_ERR(opaquedir)) - goto out; + if (WARN_ON(!workdir)) + return -EROFS; + + if (is_dir) { + if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { + opaquedir = ovl_check_empty_and_clear(dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } else { + LIST_HEAD(list); + + /* + * When removing an empty opaque directory, then it + * makes no sense to replace it with an exact replica of + * itself. But emptiness still needs to be checked. + */ + err = ovl_check_empty_dir(dentry, &list); + ovl_cache_free(&list); + if (err) + goto out; + } } err = ovl_lock_rename_workdir(workdir, upperdir); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5f0d1993e6e3..bf8537c7f455 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -529,7 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ufs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) + if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir)) return -EROFS; return 0; @@ -925,9 +925,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); err = PTR_ERR(ufs->workdir); if (IS_ERR(ufs->workdir)) { - pr_err("overlayfs: failed to create directory %s/%s\n", - ufs->config.workdir, OVL_WORKDIR_NAME); - goto out_put_upper_mnt; + pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + ufs->config.workdir, OVL_WORKDIR_NAME, -err); + sb->s_flags |= MS_RDONLY; + ufs->workdir = NULL; } } @@ -997,7 +998,6 @@ out_put_lower_mnt: kfree(ufs->lower_mnt); out_put_workdir: dput(ufs->workdir); -out_put_upper_mnt: mntput(ufs->upper_mnt); out_put_lowerpath: for (i = 0; i < numlower; i++) diff --git a/fs/splice.c b/fs/splice.c index 476024bb6546..bfe62ae40f40 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1161,7 +1161,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, long ret, bytes; umode_t i_mode; size_t len; - int i, flags; + int i, flags, more; /* * We require the input being a regular file, as we don't want to @@ -1204,6 +1204,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * Don't block on output, we have to drain the direct pipe. */ sd->flags &= ~SPLICE_F_NONBLOCK; + more = sd->flags & SPLICE_F_MORE; while (len) { size_t read_len; @@ -1217,6 +1218,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->total_len = read_len; /* + * If more data is pending, set SPLICE_F_MORE + * If this is the last data and SPLICE_F_MORE was not set + * initially, clears it. + */ + if (read_len < len) + sd->flags |= SPLICE_F_MORE; + else if (!more) + sd->flags &= ~SPLICE_F_MORE; + /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 04e79d57bca6..e9d401ce93bb 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -574,8 +574,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) * After the last attribute is removed revert to original inode format, * making all literal area available to the data fork once more. */ -STATIC void -xfs_attr_fork_reset( +void +xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { @@ -641,7 +641,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (mp->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); @@ -905,7 +905,7 @@ xfs_attr3_leaf_to_shortform( if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 025c4b820c03..882c8d338891 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -53,7 +53,7 @@ int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); - +void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index aeffeaaac0ec..f1026e86dabc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3224,12 +3224,24 @@ xfs_bmap_extsize_align( align_alen += temp; align_off -= temp; } + + /* Same adjustment for the end of the requested area. */ + temp = (align_alen % extsz); + if (temp) + align_alen += extsz - temp; + /* - * Same adjustment for the end of the requested area. + * For large extent hint sizes, the aligned extent might be larger than + * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls + * the length back under MAXEXTLEN. The outer allocation loops handle + * short allocation just fine, so it is safe to do this. We only want to + * do it when we are forced to, though, because it means more allocation + * operations are required. */ - if ((temp = (align_alen % extsz))) { - align_alen += extsz - temp; - } + while (align_alen > MAXEXTLEN) + align_alen -= extsz; + ASSERT(align_alen <= MAXEXTLEN); + /* * If the previous block overlaps with this proposed allocation * then move the start forward without adjusting the length. @@ -3318,7 +3330,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - ASSERT(orig_end <= align_off + align_alen); + /* see MAXEXTLEN handling above */ + ASSERT(orig_end <= align_off + align_alen || + align_alen + extsz > MAXEXTLEN); } #ifdef DEBUG @@ -4099,13 +4113,6 @@ xfs_bmapi_reserve_delalloc( /* Figure out the extent size, adjust alen */ extsz = xfs_get_extsz_hint(ip); if (extsz) { - /* - * Make sure we don't exceed a single extent length when we - * align the extent by reducing length we are going to - * allocate by the maximum amount extent size aligment may - * require. - */ - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 07349a183a11..1c9e75521250 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -376,7 +376,7 @@ xfs_ialloc_ag_alloc( */ newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && - percpu_counter_read(&args.mp->m_icount) + newlen > + percpu_counter_read_positive(&args.mp->m_icount) + newlen > args.mp->m_maxicount) return -ENOSPC; args.minlen = args.maxlen = args.mp->m_ialloc_blks; @@ -1339,10 +1339,13 @@ xfs_dialloc( * If we have already hit the ceiling of inode blocks then clear * okalloc so we scan all available agi structures for a free * inode. + * + * Read rough value of mp->m_icount by percpu_counter_read_positive, + * which will sacrifice the preciseness but improve the performance. */ if (mp->m_maxicount && - percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos > - mp->m_maxicount) { + percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos + > mp->m_maxicount) { noroom = 1; okalloc = 0; } diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index f9c1c64782d3..3fbf167cfb4c 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -380,23 +380,31 @@ xfs_attr3_root_inactive( return error; } +/* + * xfs_attr_inactive kills all traces of an attribute fork on an inode. It + * removes both the on-disk and in-memory inode fork. Note that this also has to + * handle the condition of inodes without attributes but with an attribute fork + * configured, so we can't use xfs_inode_hasattr() here. + * + * The in-memory attribute fork is removed even on error. + */ int -xfs_attr_inactive(xfs_inode_t *dp) +xfs_attr_inactive( + struct xfs_inode *dp) { - xfs_trans_t *trans; - xfs_mount_t *mp; - int error; + struct xfs_trans *trans; + struct xfs_mount *mp; + int cancel_flags = 0; + int lock_mode = XFS_ILOCK_SHARED; + int error = 0; mp = dp->i_mount; ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return 0; - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_ilock(dp, lock_mode); + if (!XFS_IFORK_Q(dp)) + goto out_destroy_fork; + xfs_iunlock(dp, lock_mode); /* * Start our first transaction of the day. @@ -408,13 +416,18 @@ xfs_attr_inactive(xfs_inode_t *dp) * the inode in every transaction to let it float upward through * the log. */ + lock_mode = 0; trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); - if (error) { - xfs_trans_cancel(trans, 0); - return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + if (error) + goto out_cancel; + + lock_mode = XFS_ILOCK_EXCL; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; + xfs_ilock(dp, lock_mode); + + if (!XFS_IFORK_Q(dp)) + goto out_cancel; /* * No need to make quota reservations here. We expect to release some @@ -422,29 +435,31 @@ xfs_attr_inactive(xfs_inode_t *dp) */ xfs_trans_ijoin(trans, dp, 0); - /* - * Decide on what work routines to call based on the inode size. - */ - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = 0; - goto out; + /* invalidate and truncate the attribute fork extents */ + if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out_cancel; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out_cancel; } - error = xfs_attr3_root_inactive(&trans, dp); - if (error) - goto out; - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); - if (error) - goto out; + /* Reset the attribute fork - this also destroys the in-core fork */ + xfs_attr_fork_remove(dp, trans); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - + xfs_iunlock(dp, lock_mode); return error; -out: - xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); +out_cancel: + xfs_trans_cancel(trans, cancel_flags); +out_destroy_fork: + /* kill the in-core attr fork before we drop the inode lock */ + if (dp->i_afp) + xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (lock_mode) + xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8121e75352ee..3b7591224f4a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -124,7 +124,7 @@ xfs_iozero( status = 0; } while (count); - return (-status); + return status; } int diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d6ebc85192b7..539a85fddbc2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1946,21 +1946,17 @@ xfs_inactive( /* * If there are attributes associated with the file then blow them away * now. The code calls a routine that recursively deconstructs the - * attribute fork. We need to just commit the current transaction - * because we can't use it for xfs_attr_inactive(). + * attribute fork. If also blows away the in-core attribute fork. */ - if (ip->i_d.di_anextents > 0) { - ASSERT(ip->i_d.di_forkoff != 0); - + if (XFS_IFORK_Q(ip)) { error = xfs_attr_inactive(ip); if (error) return; } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - + ASSERT(!ip->i_afp); ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_forkoff == 0); /* * Free the inode. @@ -2883,7 +2879,13 @@ xfs_rename_alloc_whiteout( if (error) return error; - /* Satisfy xfs_bumplink that this is a real tmpfile */ + /* + * Prepare the tmpfile inode as if it were created through the VFS. + * Otherwise, the link increment paths will complain about nlink 0->1. + * Drop the link count as done by d_tmpfile(), complete the inode setup + * and flag it as linkable. + */ + drop_nlink(VFS_I(tmpfile)); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -3151,7 +3153,7 @@ xfs_rename( * intermediate state on disk. */ if (wip) { - ASSERT(wip->i_d.di_nlink == 0); + ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); error = xfs_bumplink(tp, wip); if (error) goto out_trans_abort; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2ce7ee3b4ec1..6f23fbdfb365 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1084,14 +1084,18 @@ xfs_log_sbcount(xfs_mount_t *mp) return xfs_sync_sb(mp, true); } +/* + * Deltas for the inode count are +/-64, hence we use a large batch size + * of 128 so we don't need to take the counter lock on every update. + */ +#define XFS_ICOUNT_BATCH 128 int xfs_mod_icount( struct xfs_mount *mp, int64_t delta) { - /* deltas are +/-64, hence the large batch size of 128. */ - __percpu_counter_add(&mp->m_icount, delta, 128); - if (percpu_counter_compare(&mp->m_icount, 0) < 0) { + __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { ASSERT(0); percpu_counter_add(&mp->m_icount, -delta); return -EINVAL; @@ -1113,6 +1117,14 @@ xfs_mod_ifree( return 0; } +/* + * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates. + */ +#define XFS_FDBLOCKS_BATCH 1024 int xfs_mod_fdblocks( struct xfs_mount *mp, @@ -1151,25 +1163,19 @@ xfs_mod_fdblocks( * Taking blocks away, need to be more accurate the closer we * are to zero. * - * batch size is set to a maximum of 1024 blocks - if we are - * allocating of freeing extents larger than this then we aren't - * going to be hammering the counter lock so a lock per update - * is not a problem. - * * If the counter has a value of less than 2 * max batch size, * then make everything serialise as we are real close to * ENOSPC. */ -#define __BATCH 1024 - if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0) + if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + XFS_FDBLOCKS_BATCH) < 0) batch = 1; else - batch = __BATCH; -#undef __BATCH + batch = XFS_FDBLOCKS_BATCH; __percpu_counter_add(&mp->m_fdblocks, delta, batch); - if (percpu_counter_compare(&mp->m_fdblocks, - XFS_ALLOC_SET_ASIDE(mp)) >= 0) { + if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp), + XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; } |