diff options
Diffstat (limited to 'fs')
56 files changed, 835 insertions, 370 deletions
diff --git a/fs/afs/internal.h b/fs/afs/internal.h index e03910cebdd4..804d1f905622 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -441,7 +441,10 @@ enum afs_lock_state { }; /* - * AFS inode private data + * AFS inode private data. + * + * Note that afs_alloc_inode() *must* reset anything that could incorrectly + * leak from one inode to another. */ struct afs_vnode { struct inode vfs_inode; /* the VFS's inode record */ diff --git a/fs/afs/security.c b/fs/afs/security.c index 2b00097101b3..b88b7d45fdaa 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -120,7 +120,7 @@ static void afs_hash_permits(struct afs_permits *permits) void afs_cache_permit(struct afs_vnode *vnode, struct key *key, unsigned int cb_break) { - struct afs_permits *permits, *xpermits, *replacement, *new = NULL; + struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL; afs_access_t caller_access = READ_ONCE(vnode->status.caller_access); size_t size = 0; bool changed = false; @@ -204,7 +204,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, new = kzalloc(sizeof(struct afs_permits) + sizeof(struct afs_permit) * size, GFP_NOFS); if (!new) - return; + goto out_put; refcount_set(&new->usage, 1); new->nr_permits = size; @@ -229,8 +229,6 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, afs_hash_permits(new); - afs_put_permits(permits); - /* Now see if the permit list we want is actually already available */ spin_lock(&afs_permits_lock); @@ -262,11 +260,15 @@ found: kfree(new); spin_lock(&vnode->lock); - if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break) || - permits != rcu_access_pointer(vnode->permit_cache)) - goto someone_else_changed_it_unlock; - rcu_assign_pointer(vnode->permit_cache, replacement); + zap = rcu_access_pointer(vnode->permit_cache); + if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) && + zap == permits) + rcu_assign_pointer(vnode->permit_cache, replacement); + else + zap = replacement; spin_unlock(&vnode->lock); + afs_put_permits(zap); +out_put: afs_put_permits(permits); return; diff --git a/fs/afs/super.c b/fs/afs/super.c index d3f97da61bdf..1037dd41a622 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -536,7 +536,9 @@ static void afs_kill_super(struct super_block *sb) } /* - * initialise an inode cache slab element prior to any use + * Initialise an inode cache slab element prior to any use. Note that + * afs_alloc_inode() *must* reset anything that could incorrectly leak from one + * inode to another. */ static void afs_i_init_once(void *_vnode) { @@ -568,11 +570,21 @@ static struct inode *afs_alloc_inode(struct super_block *sb) atomic_inc(&afs_count_active_inodes); + /* Reset anything that shouldn't leak from one inode to the next. */ memset(&vnode->fid, 0, sizeof(vnode->fid)); memset(&vnode->status, 0, sizeof(vnode->status)); vnode->volume = NULL; + vnode->lock_key = NULL; + vnode->permit_cache = NULL; + vnode->cb_interest = NULL; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = NULL; +#endif + vnode->flags = 1 << AFS_VNODE_UNSET; + vnode->cb_type = 0; + vnode->lock_state = AFS_VNODE_LOCK_NONE; _leave(" = %p", &vnode->vfs_inode); return &vnode->vfs_inode; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d79ced925861..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk) pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); - ino->last_used = jiffies; } + ino->last_used = jiffies; return status; } @@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new; new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - if (new == dentry) - dput(new); - else { - struct autofs_info *ino; - - ino = autofs4_dentry_ino(new); - ino->last_used = jiffies; - dput(path->dentry); - path->dentry = new; - } + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; } return path->dentry; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b35ce16b3df3..5982c8a71f02 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -295,7 +295,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages) + unsigned long nr_pages, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; @@ -327,7 +328,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; refcount_set(&cb->pending_bios, 1); @@ -374,7 +375,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_put(bio); bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1528,5 +1529,5 @@ unsigned int btrfs_compress_str2level(const char *str) if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) return str[5] - '0'; - return 0; + return BTRFS_ZLIB_DEFAULT_LEVEL; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index da20755ebf21..0868cc554f14 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,6 +34,8 @@ /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) +#define BTRFS_ZLIB_DEFAULT_LEVEL 3 + struct compressed_bio { /* number of bios pending for this compressed extent */ refcount_t pending_bios; @@ -91,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages); + unsigned long nr_pages, + unsigned int write_flags); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 51477a537c83..13c260b525a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3180,6 +3180,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index efce9a2fa9be..10a2a579cc7f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } @@ -3848,7 +3848,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) { + /* + * Since btrfs_mark_buffer_dirty() can be called with item pointer set + * but item data not updated. + * So here we should only check item pointers, not item data. + */ + if (btrfs_header_level(buf) == 0 && + btrfs_check_leaf_relaxed(root, buf)) { btrfs_print_leaf(buf); ASSERT(0); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7208ecef7088..4497f937e8fb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3502,13 +3502,6 @@ again: goto again; } - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -3532,6 +3525,13 @@ again: } WARN_ON(ret); + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + if (i_size_read(inode) > 0) { ret = btrfs_check_trunc_cache_free_space(fs_info, &fs_info->global_block_rsv); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f9e9f721efe2..012d63870b99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3253,7 +3253,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, delalloc_start, delalloc_end, &page_started, - nr_written); + nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4a8861379d3e..93dcae0c3183 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -116,7 +116,8 @@ struct extent_io_ops { */ int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written); + unsigned long *nr_written, + struct writeback_control *wbc); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -365,10 +366,11 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) + u64 end, unsigned int extra_bits, + struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, NULL, cached_state, GFP_NOFS); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f80254d82f40..eb1bac7c8553 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,6 +477,47 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -497,14 +538,34 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, u64 end_of_last_block; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(inode); + unsigned int extra_bits = 0; start_pos = pos & ~((u64) fs_info->sectorsize - 1); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); end_of_last_block = start_pos + num_bytes - 1; + + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (start_pos >= isize && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case + * so just set the delalloc new bit for the range + * directly. + */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode), + start_pos, + num_bytes, cached); + if (err) + return err; + } + } + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - cached, 0); + extra_bits, cached, 0); if (err) return err; @@ -1404,47 +1465,6 @@ fail: } -static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, - const u64 start, - const u64 len, - struct extent_state **cached_state) -{ - u64 search_start = start; - const u64 end = start + len - 1; - - while (search_start < end) { - const u64 search_len = end - search_start + 1; - struct extent_map *em; - u64 em_len; - int ret = 0; - - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); - if (IS_ERR(em)) - return PTR_ERR(em); - - if (em->block_start != EXTENT_MAP_HOLE) - goto next; - - em_len = em->len; - if (em->start < search_start) - em_len -= search_start - em->start; - if (em_len > search_len) - em_len = search_len; - - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); -next: - search_start = extent_map_end(em); - free_extent_map(em); - if (ret) - return ret; - } - return 0; -} - /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1473,10 +1493,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size || - (inode->flags & BTRFS_INODE_PREALLOC)) { + if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - unsigned int clear_bits; lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); @@ -1498,19 +1516,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - ret = btrfs_find_new_delalloc_bytes(inode, start_pos, - last_pos - start_pos + 1, - cached_state); - clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; - if (ret) - clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; - clear_extent_bit(&inode->io_tree, start_pos, - last_pos, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, cached_state, GFP_NOFS); - if (ret) - return ret; + clear_extent_bit(&inode->io_tree, start_pos, last_pos, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached_state, GFP_NOFS); *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -2048,6 +2057,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); + btrfs_init_log_ctx(&ctx, inode); + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by @@ -2194,8 +2205,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx, inode); - ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -2253,6 +2262,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cdc9f4015ec3..4426d1c73e50 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1264,7 +1264,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, inode, 0); if (ret) - goto out; + goto out_unlock; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); @@ -1358,6 +1358,7 @@ out_nospc_locked: out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state); +out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b93fe05a39c7..993061f83067 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -378,6 +378,7 @@ struct async_cow { struct page *locked_page; u64 start; u64 end; + unsigned int write_flags; struct list_head extents; struct btrfs_work work; }; @@ -857,7 +858,8 @@ retry: async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages)) { + async_extent->nr_pages, + async_cow->write_flags)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1191,7 +1193,8 @@ static noinline void async_cow_free(struct btrfs_work *work) static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; @@ -1208,6 +1211,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; + async_cow->write_flags = write_flags; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -1577,11 +1581,13 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) */ static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); + unsigned int write_flags = wbc_to_write_flags(wbc); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1596,7 +1602,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written); + page_started, nr_written, + write_flags); } if (ret) btrfs_cleanup_ordered_extents(inode, start, end - start + 1); @@ -2025,11 +2032,12 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, } int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state); + extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2090,7 +2098,7 @@ again: goto out; } - btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state, + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state, 0); ClearPageChecked(page); set_page_dirty(page); @@ -4790,7 +4798,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -5438,6 +5446,14 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, goto out_err; btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY) { + btrfs_warn(root->fs_info, +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", + __func__, name, btrfs_ino(BTRFS_I(dir)), + location->objectid, location->type, location->offset); + goto out_err; + } out: btrfs_free_path(path); return ret; @@ -5754,8 +5770,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); @@ -9150,7 +9164,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, end, + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4cf2eb67eba6..f0c3f00e97cb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3268,7 +3268,8 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0); + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, + 0); set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c10e4c70f02d..20d3300bd268 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3521,7 +3521,40 @@ out: } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Check if inode ino2, or any of its ancestors, is inode ino1. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int check_ino_in_path(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + const u64 ino2_gen, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + if (ino1 == ino2) + return ino1_gen == ino2_gen; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent; + u64 parent_gen; + int ret; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) + return ret; + if (parent == ino1) + return parent_gen == ino1_gen; + ino = parent; + } + return 0; +} + +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ static int is_ancestor(struct btrfs_root *root, @@ -3530,36 +3563,91 @@ static int is_ancestor(struct btrfs_root *root, const u64 ino2, struct fs_path *fs_path) { - u64 ino = ino2; - bool free_path = false; + bool free_fs_path = false; int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; if (!fs_path) { fs_path = fs_path_alloc(); if (!fs_path) return -ENOMEM; - free_path = true; + free_fs_path = true; } - while (ino > BTRFS_FIRST_FREE_OBJECTID) { - u64 parent; - u64 parent_gen; + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } - fs_path_reset(fs_path); - ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); - if (ret < 0) { - if (ret == -ENOENT && ino == ino2) - ret = 0; - goto out; + key.objectid = ino2; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + continue; } - if (parent == ino1) { - ret = parent_gen == ino1_gen ? 1 : 0; - goto out; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino2) + break; + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + unsigned long ptr; + struct btrfs_inode_extref *extref; + + ptr = btrfs_item_ptr_offset(leaf, slot); + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + parent = btrfs_inode_extref_parent(leaf, + extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + parent = key.offset; + cur_offset = item_size; + } + + ret = get_inode_info(root, parent, NULL, &parent_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = check_ino_in_path(root, ino1, ino1_gen, + parent, parent_gen, fs_path); + if (ret) + goto out; } - ino = parent; + path->slots[0]++; } + ret = 0; out: - if (free_path) + btrfs_free_path(path); + if (free_fs_path) fs_path_free(fs_path); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 305cae7444a0..3a4dce153645 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -507,9 +507,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, token == Opt_compress_force || strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; - info->compress_level = - btrfs_compress_str2level(args[0].from); + info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + /* + * args[0] contains uninitialized data since + * for these tokens we don't expect any + * parameter. + */ + if (token != Opt_compress && + token != Opt_compress_force) + info->compress_level = + btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d06b1c931d05..2e7f64a3b22b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -114,7 +114,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -145,7 +145,7 @@ static int test_find_delalloc(u32 sectorsize) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -200,7 +200,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index f797642c013d..30affb60da51 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -984,7 +984,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1018,7 +1018,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1036,7 +1036,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1053,7 +1053,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1089,7 +1089,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 114fc5f0ecc5..ce4ed6ec8f39 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -242,7 +242,8 @@ static int check_leaf_item(struct btrfs_root *root, return ret; } -int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) +static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, + bool check_item_data) { struct btrfs_fs_info *fs_info = root->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -361,10 +362,15 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) return -EUCLEAN; } - /* Check if the item size and content meet other criteria */ - ret = check_leaf_item(root, leaf, &key, slot); - if (ret < 0) - return ret; + if (check_item_data) { + /* + * Check if the item size and content meet other + * criteria + */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + } prev_key.objectid = key.objectid; prev_key.type = key.type; @@ -374,6 +380,17 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) return 0; } +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, true); +} + +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, false); +} + int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) { unsigned long nr = btrfs_header_nritems(node); diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 96c486e95d70..3d53e8d6fda0 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -20,7 +20,19 @@ #include "ctree.h" #include "extent_io.h" -int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf); +/* + * Comprehensive leaf checker. + * Will check not only the item pointers, but also every possible member + * in item data. + */ +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf); + +/* + * Less strict leaf checker. + * Will only check item pointers, not reading item data. + */ +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf); int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index aa7c71cff575..7bf9b31561db 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4102,7 +4102,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ordered_io_err) { ctx->io_err = -EIO; - return 0; + return ctx->io_err; } btrfs_init_map_token(&token); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 925070b9ce03..49810b70afd3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -189,6 +189,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } kfree(fs_devices); @@ -578,6 +579,7 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev) fs_devs->num_devices--; list_del(&dev->dev_list); rcu_string_free(dev->name); + bio_put(dev->flush_bio); kfree(dev); } break; @@ -630,6 +632,7 @@ static noinline int device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { + bio_put(device->flush_bio); kfree(device); return -ENOMEM; } @@ -742,6 +745,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); goto error; } @@ -807,6 +811,7 @@ again: list_del_init(&device->dev_list); fs_devices->num_devices--; rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1750,20 +1755,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; + if (ret) { + if (ret > 0) + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + out: btrfs_free_path(path); - btrfs_commit_transaction(trans); + if (!ret) + ret = btrfs_commit_transaction(trans); return ret; } @@ -1993,7 +2002,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); - list_del_rcu(&srcdev->dev_alloc_list); + list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; if (srcdev->missing) fs_devices->missing_devices--; @@ -2349,6 +2358,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -2358,6 +2368,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); ret = PTR_ERR(trans); goto error; @@ -2501,6 +2512,7 @@ error_trans: if (trans) btrfs_end_transaction(trans); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2567,6 +2579,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -6284,6 +6297,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { + bio_put(dev->flush_bio); kfree(dev); return ERR_PTR(ret); } @@ -627,7 +627,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + if (!pmd_dirty(*pmdp) + && !pmd_access_permitted(*pmdp, WRITE)) goto unlock_pmd; flush_cache_page(vma, address, pfn); diff --git a/fs/exec.c b/fs/exec.c index 1d6243d9f2b6..6be2aa0ab26f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1340,10 +1340,15 @@ void setup_new_exec(struct linux_binprm * bprm) * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid - * needing to clean up the change on failure. + * races from other threads changing the limits. This also + * must be protected from races with prlimit() calls. */ + task_lock(current->group_leader); if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) + current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; + task_unlock(current->group_leader); } arch_pick_mmap_layout(current->mm); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 016c46b5e44c..20a0a89eaca5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -779,7 +779,7 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { - int new_rdonly; + bool new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e76730aac0d..8a85f3f53446 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * page_put due to reference from alloc_huge_page() * unlock_page because locked by add_to_page_cache() + * page_put due to reference from alloc_huge_page() */ - put_page(page); unlock_page(page); + put_page(page); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0d4e590e0549..826a89184f90 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -578,8 +578,10 @@ static void nlm_complain_hosts(struct net *net) if (ln->nrhosts == 0) return; - printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net); - dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net); + pr_warn("lockd: couldn't shutdown host module for net %x!\n", + net->ns.inum); + dprintk("lockd: %lu hosts left in net %x:\n", ln->nrhosts, + net->ns.inum); } else { if (nrhosts == 0) return; @@ -590,9 +592,9 @@ static void nlm_complain_hosts(struct net *net) for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; - dprintk(" %s (cnt %d use %d exp %ld net %p)\n", + dprintk(" %s (cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, host->net->ns.inum); } } @@ -605,7 +607,8 @@ nlm_shutdown_hosts_net(struct net *net) mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ - dprintk("lockd: nuking all hosts in net %p...\n", net); + dprintk("lockd: nuking all hosts in net %x...\n", + net ? net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -618,9 +621,8 @@ nlm_shutdown_hosts_net(struct net *net) /* Then, perform a garbage collection pass */ nlm_gc_hosts(net); - mutex_unlock(&nlm_host_mutex); - nlm_complain_hosts(net); + mutex_unlock(&nlm_host_mutex); } /* @@ -646,7 +648,8 @@ nlm_gc_hosts(struct net *net) struct hlist_node *next; struct nlm_host *host; - dprintk("lockd: host garbage collection for net %p\n", net); + dprintk("lockd: host garbage collection for net %x\n", + net ? net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -662,9 +665,10 @@ nlm_gc_hosts(struct net *net) if (atomic_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " - "(cnt %d use %d exp %ld net %p)\n", + "(cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, + host->net->ns.inum); continue; } nlm_destroy_host_locked(host); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9fbbd11f9ecb..96cfb2967ac7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -110,7 +110,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, clnt = nsm_create(host->net, host->nodename); if (IS_ERR(clnt)) { dprintk("lockd: failed to create NSM upcall transport, " - "status=%ld, net=%p\n", PTR_ERR(clnt), host->net); + "status=%ld, net=%x\n", PTR_ERR(clnt), + host->net->ns.inum); return PTR_ERR(clnt); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index a8e3777c94dc..9c36d614bf89 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,6 +57,9 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; +atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); + unsigned int lockd_net_id; /* @@ -259,7 +262,7 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) if (error < 0) goto err_bind; set_grace_period(net); - dprintk("lockd_up_net: per-net data created; net=%p\n", net); + dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: @@ -274,12 +277,15 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); - dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); + dprintk("%s: per-net data destroyed; net=%x\n", + __func__, net->ns.inum); } } else { - printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", - nlmsvc_task, net); + pr_err("%s: no users! task=%p, net=%x\n", + __func__, nlmsvc_task, net->ns.inum); BUG(); } } @@ -290,7 +296,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -301,6 +308,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -317,7 +326,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -329,6 +339,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -345,10 +357,12 @@ static void lockd_unregister_notifiers(void) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif + wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); } static void lockd_svc_exit_thread(void) { + atomic_dec(&nlm_ntf_refcnt); lockd_unregister_notifiers(); svc_exit_thread(nlmsvc_rqst); } @@ -373,6 +387,7 @@ static int lockd_start_svc(struct svc_serv *serv) goto out_rqst; } + atomic_inc(&nlm_ntf_refcnt); svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; @@ -676,6 +691,17 @@ static int lockd_init_net(struct net *net) static void lockd_exit_net(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + WARN_ONCE(!list_empty(&ln->lockd_manager.list), + "net %x %s: lockd_manager.list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(!list_empty(&ln->nsm_handles), + "net %x %s: nsm_handles list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(delayed_work_pending(&ln->grace_period_end), + "net %x %s: grace_period_end was not cancelled\n", + net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index a563ddbc19e6..4ec3d6e03e76 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -370,7 +370,7 @@ nlmsvc_mark_resources(struct net *net) { struct nlm_host hint; - dprintk("lockd: nlmsvc_mark_resources for net %p\n", net); + dprintk("lockd: %s for net %x\n", __func__, net ? net->ns.inum : 0); hint.net = net; nlm_traverse_files(&hint, nlmsvc_mark_host, NULL); } diff --git a/fs/mbcache.c b/fs/mbcache.c index d818fd236787..b8b8b9ced9f8 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); + /* Unlikely, but not impossible */ + if (unlikely(cache->c_entry_count < 0)) + return 0; return cache->c_entry_count; } diff --git a/fs/namei.c b/fs/namei.c index f0c7a7b9b6ca..9cc91fb7f156 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | - LOOKUP_AUTOMOUNT))) { - /* Positive dentry that isn't meant to trigger an - * automount, EISDIR will allow it to be used, - * otherwise there's no mount here "now" so return - * ENOENT. - */ - if (path->dentry->d_inode) - return -EISDIR; - else - return -ENOENT; - } + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 54fd56d715a8..e4f4a09ed9f4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -71,8 +71,8 @@ const nfs4_stateid zero_stateid = { }; const nfs4_stateid invalid_stateid = { { - .seqid = cpu_to_be32(0xffffffffU), - .other = { 0 }, + /* Funky initialiser keeps older gcc versions happy */ + .data = { 0xff, 0xff, 0xff, 0xff, 0 }, }, .type = NFS4_INVALID_STATEID_TYPE, }; diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 897b299db55e..5be08f02a76b 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -30,7 +30,11 @@ locks_start_grace(struct net *net, struct lock_manager *lm) struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); - list_add(&lm->list, grace_list); + if (list_empty(&lm->list)) + list_add(&lm->list, grace_list); + else + WARN(1, "double list_add attempt detected in net %x %s\n", + net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); @@ -104,7 +108,9 @@ grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); - BUG_ON(!list_empty(grace_list)); + WARN_ONCE(!list_empty(grace_list), + "net %x %s: grace_list is not empty\n", + net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 46b48dbbdd32..8ceb25a10ea0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -232,7 +232,7 @@ static struct cache_head *expkey_alloc(void) return NULL; } -static struct cache_detail svc_expkey_cache_template = { +static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", @@ -748,7 +748,7 @@ static struct cache_head *svc_export_alloc(void) return NULL; } -static struct cache_detail svc_export_cache_template = { +static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", @@ -1230,7 +1230,7 @@ nfsd_export_init(struct net *net) int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: initializing export module (net: %p).\n", net); + dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if (IS_ERR(nn->svc_export_cache)) @@ -1278,7 +1278,7 @@ nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: shutting down export module (net: %p).\n", net); + dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); @@ -1286,5 +1286,5 @@ nfsd_export_shutdown(struct net *net) cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); - dprintk("nfsd: export shutdown complete (net: %p).\n", net); + dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); } diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 1c91391f4805..36358d435cb0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -119,6 +119,9 @@ struct nfsd_net { u32 clverifier_counter; struct svc_serv *nfsd_serv; + + wait_queue_head_t ntf_wq; + atomic_t ntf_refcnt; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 6b9b6cca469f..a5bb76593ce7 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -178,7 +178,7 @@ static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); static struct ent *idtoname_update(struct cache_detail *, struct ent *, struct ent *); -static struct cache_detail idtoname_cache_template = { +static const struct cache_detail idtoname_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", @@ -341,7 +341,7 @@ static struct ent *nametoid_update(struct cache_detail *, struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); -static struct cache_detail nametoid_cache_template = { +static const struct cache_detail nametoid_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b82817767b9d..b29b5a185a2c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,12 +63,16 @@ static const stateid_t zero_stateid = { static const stateid_t currentstateid = { .si_generation = 1, }; +static const stateid_t close_stateid = { + .si_generation = 0xffffffffU, +}; static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) +#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); @@ -83,6 +87,11 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid); */ static DEFINE_SPINLOCK(state_lock); +enum nfsd4_st_mutex_lock_subclass { + OPEN_STATEID_MUTEX = 0, + LOCK_STATEID_MUTEX = 1, +}; + /* * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for * the refcount on the open stateid to drop. @@ -3562,7 +3571,9 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; - if (local->st_stateowner == &oo->oo_owner) { + if (local->st_stateowner != &oo->oo_owner) + continue; + if (local->st_stid.sc_type == NFS4_OPEN_STID) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; @@ -3571,6 +3582,52 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) return ret; } +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + switch (s->sc_type) { + default: + break; + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + ret = nfserr_bad_stateid; + break; + case NFS4_REVOKED_DELEG_STID: + ret = nfserr_deleg_revoked; + } + return ret; +} + +/* Lock the stateid st_mutex, and deal with races with CLOSE */ +static __be32 +nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) +{ + __be32 ret; + + mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); + ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret != nfs_ok) + mutex_unlock(&stp->st_mutex); + return ret; +} + +static struct nfs4_ol_stateid * +nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *stp; + for (;;) { + spin_lock(&fp->fi_lock); + stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); + if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) + break; + nfs4_put_stid(&stp->st_stid); + } + return stp; +} + static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) @@ -3613,8 +3670,9 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) stp = open->op_stp; /* We are moving these outside of the spinlocks to avoid the warnings */ mutex_init(&stp->st_mutex); - mutex_lock(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3639,7 +3697,11 @@ out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); if (retstp) { - mutex_lock(&retstp->st_mutex); + /* Handle races with CLOSE */ + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); stp = retstp; @@ -4449,6 +4511,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; + bool new_stp = false; /* * Lookup file; if found, lookup stateid and check open request, @@ -4460,9 +4523,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; - spin_lock(&fp->fi_lock); - stp = nfsd4_find_existing_open(fp, open); - spin_unlock(&fp->fi_lock); + stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; status = nfserr_bad_stateid; @@ -4470,35 +4531,31 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } + if (!stp) { + stp = init_open_stateid(fp, open); + if (!open->op_stp) + new_stp = true; + } + /* * OPEN the file, or upgrade an existing OPEN. * If truncate fails, the OPEN fails. + * + * stp is already locked. */ - if (stp) { + if (!new_stp) { /* Stateid was found, this is an OPEN upgrade */ - mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { mutex_unlock(&stp->st_mutex); goto out; } } else { - /* stp is returned locked. */ - stp = init_open_stateid(fp, open); - /* See if we lost the race to some other thread */ - if (stp->st_access_bmap != 0) { - status = nfs4_upgrade_open(rqstp, fp, current_fh, - stp, open); - if (status) { - mutex_unlock(&stp->st_mutex); - goto out; - } - goto upgrade_out; - } status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - mutex_unlock(&stp->st_mutex); + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); + mutex_unlock(&stp->st_mutex); goto out; } @@ -4507,7 +4564,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } -upgrade_out: + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); @@ -4734,7 +4791,7 @@ nfs4_laundromat(struct nfsd_net *nn) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); @@ -4855,6 +4912,18 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session) +{ + __be32 ret; + + spin_lock(&s->sc_lock); + ret = nfsd4_verify_open_stid(s); + if (ret == nfs_ok) + ret = check_stateid_generation(in, &s->sc_stateid, has_session); + spin_unlock(&s->sc_lock); + return ret; +} + static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) { if (ols->st_stateowner->so_is_open_owner && @@ -4868,7 +4937,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) struct nfs4_stid *s; __be32 status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return status; /* Client debugging aid. */ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { @@ -4883,7 +4953,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; - status = check_stateid_generation(stateid, &s->sc_stateid, 1); + status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; switch (s->sc_type) { @@ -4926,7 +4996,8 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, else if (typemask & NFS4_DELEG_STID) typemask |= NFS4_REVOKED_DELEG_STID; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { @@ -5044,7 +5115,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, &s, nn); if (status) return status; - status = check_stateid_generation(stateid, &s->sc_stateid, + status = nfsd4_stid_check_stateid_generation(stateid, s, nfsd4_has_session(cstate)); if (status) goto out; @@ -5098,7 +5169,9 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret; - mutex_lock(&stp->st_mutex); + ret = nfsd4_lock_ol_stateid(stp); + if (ret) + goto out_put_stid; ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) @@ -5109,11 +5182,13 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) lockowner(stp->st_stateowner))) goto out; + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(stp); ret = nfs_ok; out: mutex_unlock(&stp->st_mutex); +out_put_stid: nfs4_put_stid(s); return ret; } @@ -5133,6 +5208,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; + spin_lock(&s->sc_lock); switch (s->sc_type) { case NFS4_DELEG_STID: ret = nfserr_locks_held; @@ -5144,11 +5220,13 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: + spin_unlock(&s->sc_lock); refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: + spin_unlock(&s->sc_lock); dp = delegstateid(s); list_del_init(&dp->dl_recall_lru); spin_unlock(&cl->cl_lock); @@ -5157,6 +5235,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* Default falls through and returns nfserr_bad_stateid */ } + spin_unlock(&s->sc_lock); out_unlock: spin_unlock(&cl->cl_lock); out: @@ -5179,15 +5258,9 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID - || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) - /* - * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step, and - * revoked delegations are kept only for free_stateid. - */ - return nfserr_bad_stateid; - mutex_lock(&stp->st_mutex); + status = nfsd4_lock_ol_stateid(stp); + if (status != nfs_ok) + return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); @@ -5367,7 +5440,6 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) bool unhashed; LIST_HEAD(reaplist); - s->st_stid.sc_type = NFS4_CLOSED_STID; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -5407,10 +5479,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; + + stp->st_stid.sc_type = NFS4_CLOSED_STID; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); + mutex_unlock(&stp->st_mutex); + + /* See RFC5661 sectionm 18.2.4 */ + if (stp->st_stid.sc_client->cl_minorversion) + memcpy(&close->cl_stateid, &close_stateid, + sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); @@ -5436,7 +5515,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; dp = delegstateid(s); - status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); + status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate)); if (status) goto put_stateid; @@ -5642,14 +5721,41 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, return ret; } -static void +static struct nfs4_ol_stateid * +find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *lst; + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_type != NFS4_LOCK_STID) + continue; + if (lst->st_stid.sc_file == fp) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, struct nfs4_file *fp, struct inode *inode, struct nfs4_ol_stateid *open_stp) { struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfs4_ol_stateid *retstp; - lockdep_assert_held(&clp->cl_lock); + mutex_init(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: + spin_lock(&clp->cl_lock); + spin_lock(&fp->fi_lock); + retstp = find_lock_stateid(lo, fp); + if (retstp) + goto out_unlock; refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; @@ -5659,29 +5765,22 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - mutex_init(&stp->st_mutex); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); - spin_lock(&fp->fi_lock); list_add(&stp->st_perfile, &fp->fi_stateids); +out_unlock: spin_unlock(&fp->fi_lock); -} - -static struct nfs4_ol_stateid * -find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) -{ - struct nfs4_ol_stateid *lst; - struct nfs4_client *clp = lo->lo_owner.so_client; - - lockdep_assert_held(&clp->cl_lock); - - list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { - if (lst->st_stid.sc_file == fp) { - refcount_inc(&lst->st_stid.sc_count); - return lst; + spin_unlock(&clp->cl_lock); + if (retstp) { + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + stp = retstp; } - return NULL; + return stp; } static struct nfs4_ol_stateid * @@ -5694,26 +5793,25 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *clp = oo->oo_owner.so_client; + *new = false; spin_lock(&clp->cl_lock); lst = find_lock_stateid(lo, fi); - if (lst == NULL) { - spin_unlock(&clp->cl_lock); - ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); - if (ns == NULL) - return NULL; - - spin_lock(&clp->cl_lock); - lst = find_lock_stateid(lo, fi); - if (likely(!lst)) { - lst = openlockstateid(ns); - init_lock_stateid(lst, lo, fi, inode, ost); - ns = NULL; - *new = true; - } - } spin_unlock(&clp->cl_lock); - if (ns) + if (lst != NULL) { + if (nfsd4_lock_ol_stateid(lst) == nfs_ok) + goto out; + nfs4_put_stid(&lst->st_stid); + } + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); + if (ns == NULL) + return NULL; + + lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost); + if (lst == openlockstateid(ns)) + *new = true; + else nfs4_put_stid(ns); +out: return lst; } @@ -5750,7 +5848,6 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval; - bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5766,25 +5863,12 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } -retry: lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (lst == NULL) { status = nfserr_jukebox; goto out; } - mutex_lock(&lst->st_mutex); - - /* See if it's still hashed to avoid race with FREE_STATEID */ - spin_lock(&cl->cl_lock); - hashed = !list_empty(&lst->st_perfile); - spin_unlock(&cl->cl_lock); - - if (!hashed) { - mutex_unlock(&lst->st_mutex); - nfs4_put_stid(&lst->st_stid); - goto retry; - } status = nfs_ok; *plst = lst; out: @@ -5990,14 +6074,16 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; - mutex_unlock(&lock_stp->st_mutex); - /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. */ - if (status && new) + if (status && new) { + lock_stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(lock_stp); + } + + mutex_unlock(&lock_stp->st_mutex); nfs4_put_stid(&lock_stp->st_stid); } @@ -7017,6 +7103,10 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; + nn->boot_time = get_seconds(); + nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; + INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); @@ -7074,9 +7164,6 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nn->boot_time = get_seconds(); - nn->grace_ended = false; - nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n", @@ -7153,7 +7240,7 @@ nfs4_state_shutdown_net(struct net *net) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6493df6b1bd5..d107b4426f7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1241,6 +1241,9 @@ static __net_init int nfsd_init_net(struct net *net) nn->nfsd4_grace = 90; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); + + atomic_set(&nn->ntf_refcnt, 0); + init_waitqueue_head(&nn->ntf_wq); return 0; out_idmap_error: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 33117d4ffce0..89cb484f1cfb 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -335,7 +335,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -344,6 +345,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; @@ -363,7 +366,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -374,7 +378,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; } @@ -391,6 +396,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -398,6 +404,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } + wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting @@ -517,6 +524,7 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } + atomic_inc(&nn->ntf_refcnt); ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 39f1b0b0c76f..020c597ef9b6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -941,12 +941,13 @@ static int dqinit_needed(struct inode *inode, int type) } /* This routine is guarded by s_umount semaphore */ -static void add_dquot_ref(struct super_block *sb, int type) +static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif + int err = 0; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { @@ -966,7 +967,11 @@ static void add_dquot_ref(struct super_block *sb, int type) reserved = 1; #endif iput(old_inode); - __dquot_initialize(inode, type); + err = __dquot_initialize(inode, type); + if (err) { + iput(inode); + goto out; + } /* * We hold a reference to 'inode' so it couldn't have been @@ -981,7 +986,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); - +out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " @@ -989,6 +994,7 @@ static void add_dquot_ref(struct super_block *sb, int type) "Please run quotacheck(8)"); } #endif + return err; } /* @@ -2379,10 +2385,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); - add_dquot_ref(sb, type); - - return 0; + error = add_dquot_ref(sb, type); + if (error) + dquot_disable(sb, type, flags); + return error; out_file_init: dqopt->files[type] = NULL; iput(inode); @@ -2985,7 +2992,8 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - register_shrinker(&dqcache_shrinker); + if (register_shrinker(&dqcache_shrinker)) + panic("Cannot register dquot shrinker"); return 0; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 020c9cacbb2f..1fc934d24459 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2591,7 +2591,6 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 08df809e2315..1210f684d3c2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5662,7 +5662,8 @@ xfs_bmap_collapse_extents( *done = true; goto del_cursor; } - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); new_startoff = got.br_startoff - offset_shift_fsb; if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { @@ -5767,7 +5768,8 @@ xfs_bmap_insert_extents( goto del_cursor; } } - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); if (stop_fsb >= got.br_startoff + got.br_blockcount) { error = -EIO; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 637b7a892313..f120fb20452f 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -318,8 +318,20 @@ xfs_scrub_dinode( /* di_mode */ mode = be16_to_cpu(dip->di_mode); - if (mode & ~(S_IALLUGO | S_IFMT)) + switch (mode & S_IFMT) { + case S_IFLNK: + case S_IFREG: + case S_IFDIR: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + /* mode is recognized */ + break; + default: xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } /* v1/v2 fields */ switch (dip->di_version) { diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 8e58ba842946..3d9037eceaf1 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -107,7 +107,7 @@ xfs_scrub_quota_item( unsigned long long rcount; xfs_ino_t fs_icount; - offset = id * qi->qi_dqperchunk; + offset = id / qi->qi_dqperchunk; /* * We fed $id and DQNEXT into the xfs_qm_dqget call, which means @@ -207,7 +207,7 @@ xfs_scrub_quota( xfs_dqid_t id = 0; uint dqtype; int nimaps; - int error; + int error = 0; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return -ENOENT; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a3eeaba156c5..21e2d70884e1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -399,7 +399,7 @@ xfs_map_blocks( (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + count > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -896,13 +896,13 @@ xfs_writepage_map( struct writeback_control *wbc, struct inode *inode, struct page *page, - loff_t offset, - uint64_t end_offset) + uint64_t end_offset) { LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; struct buffer_head *bh, *head; ssize_t len = i_blocksize(inode); + uint64_t offset; int error = 0; int count = 0; int uptodate = 1; @@ -1146,7 +1146,7 @@ xfs_do_writepage( end_offset = offset; } - return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset); + return xfs_writepage_map(wpc, wbc, inode, page, end_offset); redirty: redirty_page_for_writepage(wbc, page); @@ -1265,7 +1265,7 @@ xfs_map_trim_size( if (mapping_size > size) mapping_size = size; if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { + (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, i_blocksize(inode)); @@ -1312,7 +1312,7 @@ xfs_get_blocks( lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + size > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes) size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index dd136f7275e4..e5fb008d75e8 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -389,7 +389,8 @@ xfs_bud_init( int xfs_bui_recover( struct xfs_mount *mp, - struct xfs_bui_log_item *buip) + struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops) { int error = 0; unsigned int bui_type; @@ -404,9 +405,7 @@ xfs_bui_recover( xfs_exntst_t state; struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_defer_ops dfops; struct xfs_bmbt_irec irec; - xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -464,7 +463,6 @@ xfs_bui_recover( if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - xfs_defer_init(&dfops, &firstfsb); /* Process deferred bmap item. */ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? @@ -479,16 +477,16 @@ xfs_bui_recover( break; default: error = -EFSCORRUPTED; - goto err_dfops; + goto err_inode; } xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); if (error) - goto err_dfops; + goto err_inode; if (count > 0) { ASSERT(type == XFS_BMAP_UNMAP); @@ -496,16 +494,11 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec); if (error) - goto err_dfops; + goto err_inode; } - /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto err_dfops; - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -513,8 +506,6 @@ xfs_bui_recover( return error; -err_dfops: - xfs_defer_cancel(&dfops); err_inode: xfs_trans_cancel(tp); if (ip) { diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index c867daae4a3c..24b354a2c836 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, struct xfs_bui_log_item *); void xfs_bui_item_free(struct xfs_bui_log_item *); void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops); #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4db6e8d780f6..4c6e86d861fd 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1815,22 +1815,27 @@ xfs_alloc_buftarg( btp->bt_daxdev = dax_dev; if (xfs_setsize_buftarg_early(btp, bdev)) - goto error; + goto error_free; if (list_lru_init(&btp->bt_lru)) - goto error; + goto error_free; if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error; + goto error_lru; btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&btp->bt_shrinker); + if (register_shrinker(&btp->bt_shrinker)) + goto error_pcpu; return btp; -error: +error_pcpu: + percpu_counter_destroy(&btp->bt_io_count); +error_lru: + list_lru_destroy(&btp->bt_lru); +error_free: kmem_free(btp); return NULL; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d57c2db64e59..f248708c10ff 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -970,14 +970,22 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if ((lip->li_flags & XFS_LI_IN_AIL) && - lip->li_lsn == qip->qli_flush_lsn) { + ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_flags & XFS_LI_FAILED))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - if (lip->li_lsn == qip->qli_flush_lsn) + if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); - else + } else { + /* + * Clear the failed state since we are about to drop the + * flush lock + */ + if (lip->li_flags & XFS_LI_FAILED) + xfs_clear_li_failed(lip); spin_unlock(&ailp->xa_lock); + } } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 2c7a1629e064..664dea105e76 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -137,6 +137,26 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * this informs the AIL that the dquot is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_dquot_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + struct xfs_dquot *dqp; + + dqp = DQUOT_ITEM(lip)->qli_dquot; + ASSERT(!completion_done(&dqp->q_flush)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -144,13 +164,28 @@ xfs_qm_dquot_logitem_push( __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -242,7 +277,8 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_committing = xfs_qm_dquot_logitem_committing + .iop_committing = xfs_qm_dquot_logitem_committing, + .iop_error = xfs_dquot_item_error }; /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 61d1cb7dc10d..801274126648 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2401,6 +2401,24 @@ retry: } /* + * Free any local-format buffers sitting around before we reset to + * extents format. + */ +static inline void +xfs_ifree_local_data( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); +} + +/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2437,6 +2455,9 @@ xfs_ifree( if (error) return error; + xfs_ifree_local_data(ip, XFS_DATA_FORK); + xfs_ifree_local_data(ip, XFS_ATTR_FORK); + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 87b1c331f9eb..28d1abfe835e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -4716,7 +4717,8 @@ STATIC int xlog_recover_process_cui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_cui_log_item *cuip; int error; @@ -4729,7 +4731,7 @@ xlog_recover_process_cui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_cui_recover(mp, cuip); + error = xfs_cui_recover(mp, cuip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4756,7 +4758,8 @@ STATIC int xlog_recover_process_bui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_bui_log_item *buip; int error; @@ -4769,7 +4772,7 @@ xlog_recover_process_bui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_bui_recover(mp, buip); + error = xfs_bui_recover(mp, buip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4805,6 +4808,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) } } +/* Take all the collected deferred ops and finish them in order. */ +static int +xlog_finish_defer_ops( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops) +{ + struct xfs_trans *tp; + int64_t freeblks; + uint resblks; + int error; + + /* + * We're finishing the defer_ops that accumulated as a result of + * recovering unfinished intent items during log recovery. We + * reserve an itruncate transaction because it is the largest + * permanent transaction type. Since we're the only user of the fs + * right now, take 93% (15/16) of the available free blocks. Use + * weird math to avoid a 64-bit division. + */ + freeblks = percpu_counter_sum(&mp->m_fdblocks); + if (freeblks <= 0) + return -ENOSPC; + resblks = min_t(int64_t, UINT_MAX, freeblks); + resblks = (resblks * 15) >> 4; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + error = xfs_defer_finish(&tp, dfops); + if (error) + goto out_cancel; + + return xfs_trans_commit(tp); + +out_cancel: + xfs_trans_cancel(tp); + return error; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4825,10 +4868,12 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_log_item *lip; - int error = 0; + struct xfs_defer_ops dfops; struct xfs_ail_cursor cur; + struct xfs_log_item *lip; struct xfs_ail *ailp; + xfs_fsblock_t firstfsb; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif @@ -4839,6 +4884,7 @@ xlog_recover_process_intents( #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif + xfs_defer_init(&dfops, &firstfsb); while (lip != NULL) { /* * We're done when we see something other than an intent. @@ -4859,6 +4905,12 @@ xlog_recover_process_intents( */ ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + /* + * NOTE: If your intent processing routine can create more + * deferred ops, you /must/ attach them to the dfops in this + * routine or else those subsequent intents will get + * replayed in the wrong order! + */ switch (lip->li_type) { case XFS_LI_EFI: error = xlog_recover_process_efi(log->l_mp, ailp, lip); @@ -4867,10 +4919,12 @@ xlog_recover_process_intents( error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; case XFS_LI_CUI: - error = xlog_recover_process_cui(log->l_mp, ailp, lip); + error = xlog_recover_process_cui(log->l_mp, ailp, lip, + &dfops); break; case XFS_LI_BUI: - error = xlog_recover_process_bui(log->l_mp, ailp, lip); + error = xlog_recover_process_bui(log->l_mp, ailp, lip, + &dfops); break; } if (error) @@ -4880,6 +4934,11 @@ xlog_recover_process_intents( out: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); + if (error) + xfs_defer_cancel(&dfops); + else + error = xlog_finish_defer_ops(log->l_mp, &dfops); + return error; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8f2e2fac4255..3a55d6fc271b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -393,7 +393,8 @@ xfs_cud_init( int xfs_cui_recover( struct xfs_mount *mp, - struct xfs_cui_log_item *cuip) + struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops) { int i; int error = 0; @@ -405,11 +406,9 @@ xfs_cui_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; enum xfs_refcount_intent_type type; - xfs_fsblock_t firstfsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; struct xfs_bmbt_irec irec; - struct xfs_defer_ops dfops; bool requeue_only = false; ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); @@ -465,7 +464,6 @@ xfs_cui_recover( return error; cudp = xfs_trans_get_cud(tp, cuip); - xfs_defer_init(&dfops, &firstfsb); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { refc = &cuip->cui_format.cui_extents[i]; refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; @@ -485,7 +483,7 @@ xfs_cui_recover( new_len = refc->pe_len; } else error = xfs_trans_log_finish_refcount_update(tp, cudp, - &dfops, type, refc->pe_startblock, refc->pe_len, + dfops, type, refc->pe_startblock, refc->pe_len, &new_fsb, &new_len, &rcur); if (error) goto abort_error; @@ -497,21 +495,21 @@ xfs_cui_recover( switch (type) { case XFS_REFCOUNT_INCREASE: error = xfs_refcount_increase_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_decrease_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_ALLOC_COW: error = xfs_refcount_alloc_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: error = xfs_refcount_free_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; @@ -525,17 +523,12 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error; abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); -abort_defer: - xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index 5b74dddfa64b..0e5327349a13 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, struct xfs_cui_log_item *); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops); #endif /* __XFS_REFCOUNT_ITEM_H__ */ |