diff options
Diffstat (limited to 'fs')
232 files changed, 8160 insertions, 4739 deletions
diff --git a/fs/adfs/super.c b/fs/adfs/super.c index a19c31d3f369..4d4a0df8344f 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index cffe8370fb44..c69a87eaf57d 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -64,7 +64,7 @@ struct affs_inode_info { /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { - return list_entry(inode, struct affs_inode_info, vfs_inode); + return container_of(inode, struct affs_inode_info, vfs_inode); } /* diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index a8f463c028ce..5fa92bc790ef 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry) { struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; - struct buffer_head *bh = NULL, *link_bh = NULL; + struct buffer_head *bh, *link_bh = NULL; u32 link_ino, ino; int retval; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a022f4accd76..17349500592d 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 { struct super_block *sb = dir->i_sb; struct buffer_head *inode_bh = NULL; - struct buffer_head *bh = NULL; + struct buffer_head *bh; u32 block = 0; int retval; diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index f39b71c3981e..ea5b69a18ba9 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; char *link = kmap(page); struct slink_front *lf; - int err; int i, j; char c; char lc; pr_debug("follow_link(ino=%lu)\n", inode->i_ino); - err = -EIO; bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) goto fail; @@ -66,7 +64,7 @@ fail: SetPageError(page); kunmap(page); unlock_page(page); - return err; + return -EIO; } const struct address_space_operations affs_symlink_aops = { diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 5b700ef1e59d..c37149b929be 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) return d_inode(sbi->sb->s_root)->i_ino; } -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - static inline void __autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); diff --git a/fs/befs/befs.h b/fs/befs/befs.h index 1fead8d56a98..35d19e8731e3 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -112,7 +112,7 @@ BEFS_SB(const struct super_block *super) static inline struct befs_inode_info * BEFS_I(const struct inode *inode) { - return list_entry(inode, struct befs_inode_info, vfs_inode); + return container_of(inode, struct befs_inode_info, vfs_inode); } static inline befs_blocknr_t diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cd46e4158830..6b659967898e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note) file = vma->vm_file; if (!file) continue; - filename = d_path(&file->f_path, name_curpos, remaining); + filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { vfree(data); @@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note) continue; } - /* d_path() fills at the end, move name down */ + /* file_path() fills at the end, move name down */ /* n = strlen(filename) + 1: */ n = (name_curpos + remaining) - filename; remaining = filename - name_curpos; diff --git a/fs/block_dev.c b/fs/block_dev.c index b155d32db766..198243717da5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -43,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode) return container_of(inode, struct bdev_inode, vfs_inode); } -inline struct block_device *I_BDEV(struct inode *inode) +struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } @@ -152,6 +152,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, + NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -443,6 +446,12 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, long avail; const struct block_device_operations *ops = bdev->bd_disk->fops; + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + if (size < 0) return size; if (!ops->direct_access) @@ -1170,6 +1179,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index df9932b00d08..1ce06c849a86 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper); BTRFS_WORK_HELPER(scrub_helper); BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); +BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ec2ee477f8ba..b0b093b6afec 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper); BTRFS_WORK_HELPER_PROTO(scrub_helper); BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); +BTRFS_WORK_HELPER_PROTO(scrubparity_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 614aaa1969bd..802fabb30e15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * the first item to check. But sometimes, we may enter it with * slot==nritems. In that case, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) - ret = btrfs_next_old_leaf(root, path, time_seq); + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + if (time_seq == (u64)-1) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + } while (!ret && count < total_refs) { eb = path->nodes[0]; @@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - ret = btrfs_next_old_item(root, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_next_item(root, path); + else + ret = btrfs_next_old_item(root, path, time_seq); } if (ret > 0) @@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); + else if (time_seq == (u64)-1) + root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, + 0, 0); + else + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, + time_seq); /* root node has been locked, we can release @subvol_srcu safely here */ srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, } /* - * merge two lists of backrefs and adjust counts accordingly + * merge backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. @@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode) ref2 = list_entry(pos2, struct __prelim_ref, list); + if (!ref_for_same_block(ref1, ref2)) + continue; if (mode == 1) { - if (!ref_for_same_block(ref1, ref2)) - continue; if (!ref1->parent && ref2->parent) { xchg = ref1; ref1 = ref2; @@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct list_head *prefs, u64 *total_refs, u64 inum) { + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; struct btrfs_key key; struct btrfs_key op_key = {0}; int sgn; @@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, btrfs_disk_key_to_cpu(&op_key, &extent_op->key); spin_lock(&head->lock); - n = rb_first(&head->ref_root); - while (n) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - n = rb_next(n); + list_for_each_entry(node, &head->ref_list, list) { if (node->seq > seq) continue; @@ -882,6 +893,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * much like trans == NULL case, the difference only lies in it will not + * commit root. + * The special case is for qgroup to search roots in commit_transaction(). + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -920,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } + if (time_seq == (u64)-1) + path->skip_locking = 1; + /* * grab both a lock on the path and a lock on the delayed ref head. * We need both to get a consistent picture of how the refs look @@ -934,9 +953,10 @@ again: BUG_ON(ret == 0); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (trans && likely(trans->type != __TRANS_DUMMY)) { + if (trans && likely(trans->type != __TRANS_DUMMY) && + time_seq != (u64)-1) { #else - if (trans) { + if (trans && time_seq != (u64)-1) { #endif /* * look if there are updates for this ref queued and lock the @@ -1034,7 +1054,10 @@ again: eb = read_tree_block(fs_info->extent_root, ref->parent, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0f11ebc92f02..54114b4887dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1439,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); old = read_tree_block(root, logical, 0); - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { - free_extent_buffer(old); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { @@ -1685,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur || !uptodate) { if (!cur) { cur = read_tree_block(root, blocknr, gen); - if (!cur || !extent_buffer_uptodate(cur)) { + if (IS_ERR(cur)) { + return PTR_ERR(cur); + } else if (!extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; } @@ -1864,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { + if (!IS_ERR(eb)) + free_extent_buffer(eb); eb = NULL; } @@ -2494,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, ret = -EAGAIN; tmp = read_tree_block(root, blocknr, 0); - if (tmp) { + if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f364e1d8d3d..80a9aefb0c46 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,7 +174,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4, 0 }; +static int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1619,10 +1619,7 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - struct kobject super_kobj; struct kobject *space_info_kobj; - struct kobject *device_dir_kobj; - struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; @@ -1698,6 +1695,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_nocow_workers; + struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1735,7 +1733,7 @@ struct btrfs_fs_info { /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; - /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ @@ -3458,6 +3456,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); @@ -3515,6 +3514,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -4050,6 +4052,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_BTRFS_ASSERT +__cold static inline void assfail(char *expr, char *file, int line) { pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", @@ -4065,10 +4068,12 @@ static inline void assfail(char *expr, char *file, int line) #define btrfs_assert() __printf(5, 6) +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -4111,11 +4116,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ - #define btrfs_abort_transaction(trans, root, errno) \ do { \ - __btrfs_abort_transaction(trans, root, __func__, \ - __LINE__, errno); \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ } while (0) #define btrfs_std_error(fs_info, errno) \ @@ -4132,6 +4143,7 @@ do { \ } while (0) __printf(5, 6) +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8f8ed7d20bac..ac3e81da6d4e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -22,6 +22,7 @@ #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" +#include "qgroup.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } -/* - * entries in the rb tree are ordered by the byte number of the extent, - * type of the delayed backrefs and content of delayed backrefs. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1, - bool compare_seq) -{ - if (ref1->bytenr < ref2->bytenr) - return -1; - if (ref1->bytenr > ref2->bytenr) - return 1; - if (ref1->is_head && ref2->is_head) - return 0; - if (ref2->is_head) - return -1; - if (ref1->is_head) - return 1; - if (ref1->type < ref2->type) - return -1; - if (ref1->type > ref2->type) - return 1; - if (ref1->no_quota > ref2->no_quota) - return 1; - if (ref1->no_quota < ref2->no_quota) - return -1; - /* merging of sequenced refs is not allowed */ - if (compare_seq) { - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; - } - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1), - ref1->type); - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), - btrfs_delayed_node_to_data_ref(ref1)); - } - BUG(); - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. - */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - struct btrfs_delayed_ref_node *ins; - int cmp; - - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, ins, 1); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, rb_erase(&head->href_node, &delayed_refs->href_root); } else { assert_spin_locked(&head->lock); - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); @@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } -static int merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, u64 seq) -{ - struct rb_node *node; - int mod = 0; - int done = 0; - - node = rb_next(&ref->rb_node); - while (!done && node) { - struct btrfs_delayed_ref_node *next; - - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - if (seq && next->seq >= seq) - break; - if (comp_entry(ref, next, 0)) - continue; - - if (ref->action == next->action) { - mod = next->ref_mod; - } else { - if (ref->ref_mod < next->ref_mod) { - struct btrfs_delayed_ref_node *tmp; - - tmp = ref; - ref = next; - next = tmp; - done = 1; - } - mod = -next->ref_mod; - } - - drop_delayed_ref(trans, delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); - done = 1; - } else { - /* - * You can't have multiples of the same ref on a tree - * block. - */ - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); - } - } - return done; -} - -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - u64 seq = 0; - - assert_spin_locked(&head->lock); - /* - * We don't have too much refs to merge in the case of delayed data - * refs. - */ - if (head->is_data) - return; - - spin_lock(&fs_info->tree_mod_seq_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - spin_unlock(&fs_info->tree_mod_seq_lock); - - node = rb_first(&head->ref_root); - while (node) { - struct btrfs_delayed_ref_node *ref; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - /* We can't merge refs that are outside of our seq count */ - if (seq && ref->seq >= seq) - break; - if (merge_ref(trans, delayed_refs, head, ref, seq)) - node = rb_first(&head->ref_root); - else - node = rb_next(&ref->rb_node); - } -} - int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -443,45 +270,71 @@ again: } /* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent + * Helper to insert the ref_node to the tail or merge with tail. * - * This may free existing if the update cancels out whatever - * operation it was doing. + * Return 0 for insert. + * Return >0 for merge. */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) +static int +add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { - if (update->action != existing->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. - */ - existing->ref_mod--; - if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, head, existing); - else - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + struct btrfs_delayed_ref_node *exist; + int mod; + int ret = 0; + + spin_lock(&href->lock); + /* Check whether we can merge the tail node with ref */ + if (list_empty(&href->ref_list)) + goto add_tail; + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, + list); + /* No need to compare bytenr nor is_head */ + if (exist->type != ref->type || exist->no_quota != ref->no_quota || + exist->seq != ref->seq) + goto add_tail; + + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), + btrfs_delayed_node_to_tree_ref(ref), + ref->type)) + goto add_tail; + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || + exist->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), + btrfs_delayed_node_to_data_ref(ref))) + goto add_tail; + + /* Now we are sure we can merge */ + ret = 1; + if (exist->action == ref->action) { + mod = ref->ref_mod; } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; + /* Need to change action */ + if (exist->ref_mod < ref->ref_mod) { + exist->action = ref->action; + mod = -exist->ref_mod; + exist->ref_mod = ref->ref_mod; + } else + mod = -ref->ref_mod; } + exist->ref_mod += mod; + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) + drop_delayed_ref(trans, root, href, exist); + spin_unlock(&href->lock); + return ret; + +add_tail: + list_add_tail(&ref->list, &href->ref_list); + atomic_inc(&root->num_entries); + trans->delayed_ref_updates++; + spin_unlock(&href->lock); + return ret; } /* @@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, int action, int is_data) + struct btrfs_delayed_ref_node *ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - head_ref->ref_root = RB_ROOT; + INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + /* Record qgroup extent info if provided */ + if (qrecord) { + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qrecord); + if (qexisting) + kfree(qrecord); + } + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + /* + * XXX: memory should be freed at the same level allocated. + * But bad practice is anywhere... Follow it now. Need cleanup. + */ + if (ret > 0) kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + if (!head_ref) + goto free_ref; + + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) + goto free_head_ref; } head_ref->extent_op = extent_op; @@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_ref: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + return -ENOMEM; } /* @@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; } + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, + head_ref); + return -ENOMEM; + } + } + head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 5eb0892396d0..13fb5e6090fe 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -24,9 +24,25 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +/* + * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the + * same ref_node structure. + * Ref_head is in a higher logic level than tree/data ref, and duplicated + * bytenr/num_bytes in ref_node is really a waste or memory, they should be + * referred from ref_head. + * This gets more disgusting after we use list to store tree/data ref in + * ref_head. Must clean this mess up later. + */ struct btrfs_delayed_ref_node { + /* + * ref_head use rb tree, stored in ref_root->href. + * indexed by bytenr + */ struct rb_node rb_node; + /*data/tree ref use list, stored in ref_head->ref_list. */ + struct list_head list; + /* the starting bytenr of the extent */ u64 bytenr; @@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_root; + struct list_head ref_list; struct rb_node href_node; @@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root href_root; + /* dirty extent records */ + struct rb_root dirty_extent_root; + /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * To make qgroup to skip given root. + * This is for snapshot, as btrfs_qgroup_inherit() will manully + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0573848c7333..862fbc206755 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_error(root->fs_info, ret, "kobj add dev failed"); + printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : @@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0bccf18dc1dc..3f43bfea3684 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1149,12 +1149,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) - return NULL; + return ERR_PTR(-ENOMEM); ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { free_extent_buffer(buf); - return NULL; + return ERR_PTR(ret); } return buf; @@ -1509,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), generation); - if (!root->node) { - ret = -ENOMEM; + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto read_fail; + free_extent_buffer(root->node); + goto find_fail; } root->commit_root = btrfs_root_node(root); out: btrfs_free_path(path); return root; -read_fail: - free_extent_buffer(root->node); find_fail: kfree(root); alloc_fail: @@ -2320,8 +2319,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { + if (IS_ERR(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + ret = PTR_ERR(log_tree_root->node); + kfree(log_tree_root); + return ret; + } else if (!extent_buffer_uptodate(log_tree_root->node)) { printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2494,7 +2497,6 @@ int open_ctree(struct super_block *sb, seqlock_init(&fs_info->profiles_lock); init_rwsem(&fs_info->delayed_iput_sem); - init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -2797,8 +2799,8 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), generation); - if (!chunk_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (IS_ERR(chunk_root->node) || + !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; @@ -2834,8 +2836,8 @@ retry_root_backup: tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), generation); - if (!tree_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (IS_ERR(tree_root->node) || + !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); @@ -2874,10 +2876,22 @@ retry_root_backup: btrfs_close_extra_devices(fs_devices, 1); + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + if (ret) { + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_device(fs_devices); + if (ret) { + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + goto fail_fsdev_sysfs; + } + ret = btrfs_sysfs_add_one(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); - goto fail_block_groups; + goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); @@ -3055,6 +3069,9 @@ fail_cleaner: fail_sysfs: btrfs_sysfs_remove_one(fs_info); +fail_fsdev_sysfs: + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3725,6 +3742,7 @@ void close_ctree(struct btrfs_root *root) } btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -4053,6 +4071,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *tmp; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, @@ -4068,11 +4087,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((node = rb_first(&head->ref_root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, + list) { ref->in_tree = 0; - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ec3acd14cbf..38b76cc02f48 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op, - int no_quota); + struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, - int no_quota, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; u64 refs; int ret; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; + int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) @@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) + if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; - /* - * Ok we were able to insert an inline extent and it appears to be a new - * reference, deal with the qgroup accounting. - */ - if (!ret && !no_quota) { - ASSERT(root->fs_info->quota_enabled); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) - type = BTRFS_QGROUP_OPER_ADD_SHARED; - btrfs_release_path(path); - - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - goto out; - } /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); - if (refs) - type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - if (ret) - goto out; - } - path->reada = 1; path->leave_spinning = 1; /* now insert the actual backref */ @@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - node->no_quota, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_free_extent(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op, node->no_quota); + extent_op); } else { BUG(); } @@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->level, &ins, node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, node->no_quota, + ret = __btrfs_inc_extent_ref(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op, - node->no_quota); + ret = __btrfs_free_extent(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else { BUG(); } @@ -2323,28 +2293,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline struct btrfs_delayed_ref_node * +static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_node *ref, *last = NULL;; + if (list_empty(&head->ref_list)) + return NULL; - /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. - */ - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->action == BTRFS_ADD_DELAYED_REF) - return ref; - else if (last == NULL) - last = ref; - node = rb_next(node); - } - return last; + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, + list); } /* @@ -2396,16 +2352,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. If we merged anything we need to re-loop so we can - * get a good ref. - */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2482,7 +2429,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root) || + if (!list_empty(&locked_ref->ref_list) || locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); @@ -2496,7 +2443,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else { actual_count++; ref->in_tree = 0; - rb_erase(&ref->rb_node, &locked_ref->ref_root); + list_del(&ref->list); } atomic_dec(&delayed_refs->num_entries); @@ -2864,9 +2811,6 @@ again: goto again; } out: - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); - if (ret) - return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2905,7 +2849,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2934,11 +2877,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - + list_for_each_entry(ref, &head->ref_list, list) { /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3693,7 +3632,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; - found->full = 0; + if (total_bytes > 0) + found->full = 0; spin_unlock(&found->lock); *space_info = found; return 0; @@ -3721,7 +3661,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - found->full = 0; + if (total_bytes > 0) + found->full = 0; + else + found->full = 1; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3975,6 +3918,9 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; + if (need_commit > 0) + btrfs_wait_ordered_roots(fs_info, -1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4088,7 +4034,7 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) { u64 num_dev; @@ -4102,24 +4048,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) else num_dev = 1; /* DUP or single */ - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); + return num_dev; } -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) +/* + * If @is_allocation is true, reserve space in the system space info necessary + * for allocating a chunk, otherwise if it's false, reserve space necessary for + * removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 type) { struct btrfs_space_info *info; u64 left; u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use; spin_unlock(&info->lock); - thresh = get_system_chunk_thresh(root, type); + num_devs = get_profile_num_devs(root, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + + btrfs_calc_trans_metadata_size(root, 1); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); @@ -4130,7 +4095,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, u64 flags; flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, root, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, + &root->fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; } } @@ -5188,6 +5167,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, trans->bytes_reserved = 0; } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->root->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) @@ -6092,11 +6089,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; struct btrfs_path *path; @@ -6110,10 +6106,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int no_quota = node->no_quota; u32 item_size; u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; int last_ref = 0; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6294,7 +6292,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { - type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -6356,18 +6353,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* Deal with the quota accounting */ - if (!ret && last_ref && !no_quota) { - int mod_seq = 0; - - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && - type == BTRFS_QGROUP_OPER_SUB_SHARED) - mod_seq = 1; - - ret = btrfs_qgroup_record_ref(trans, info, root_objectid, - bytenr, num_bytes, type, - mod_seq); - } out: btrfs_free_path(path); return ret; @@ -6393,7 +6378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (rb_first(&head->ref_root)) + if (!list_empty(&head->ref_list)) goto out; if (head->extent_op) { @@ -7303,13 +7288,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - /* Always set parent to 0 here since its exclusive anyway. */ - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, ins->offset, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7391,14 +7369,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, num_bytes, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - } - ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7755,12 +7725,18 @@ reada: wc->reada_slot = slot; } +/* + * TODO: Modify related function to add related node/leaf to dirty_extent_root, + * for later qgroup accounting. + * + * Current, this function does nothing. + */ static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type, ret; + int i, extent_type; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; @@ -7783,13 +7759,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - bytenr, num_bytes, - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); - if (ret) - return ret; } return 0; } @@ -7858,6 +7827,8 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. + * TODO: Modify this function to mark all (including complete shared node) + * to dirty_extent_root to allow it get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7920,7 +7891,11 @@ walk_down: child_gen = btrfs_node_ptr_generation(eb, parent_slot); eb = read_tree_block(root, child_bytenr, child_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); ret = -EIO; goto out; } @@ -7931,16 +7906,6 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - child_bytenr, - root->nodesize, - BTRFS_QGROUP_OPER_SUB_SUBTREE, - 0); - if (ret) - goto out; - } if (level == 0) { @@ -8151,7 +8116,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, generation); - if (!next || !extent_buffer_uptodate(next)) { + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; } @@ -8533,24 +8500,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - /* - * Qgroup update accounting is run from - * delayed ref handling. This usually works - * out because delayed refs are normally the - * only way qgroup updates are added. However, - * we may have added updates during our tree - * walk so run qgroups here to make sure we - * don't lose any updates. - */ - ret = btrfs_delayed_qgroup_accounting(trans, - root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8604,14 +8553,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); @@ -9562,6 +9503,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + /* + * Call to ensure the corresponding space_info object is created and + * assigned to our block group, but don't update its counters just yet. + * We want our bg to be added to the rbtree with its ->space_info set. + */ + ret = update_space_info(root->fs_info, cache->flags, 0, 0, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = btrfs_add_block_group_cache(root->fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); @@ -9569,6 +9523,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); if (ret) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/fs/btrfs/extent-tree.h diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c374e1e71e5f..02d05817cbdf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, @@ -4490,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + flags |= FIEMAP_EXTENT_UNWRITTEN; free_extent_map(em); em = NULL; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b072e17479aa..b823fac91c92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, } current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_log_ctx ctx; int ret = 0; bool full_sync = 0; + const u64 len = end - start + 1; trace_btrfs_sync_file(file, datasync); @@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * all extents are persisted and the respective file extent * items are in the fs/subvol btree. */ - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + ret = btrfs_wait_ordered_range(inode, start, len); } else { /* * Start any new ordered operations before starting to log the @@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed)) { + (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed && + (full_sync || + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9dbe5b548fa6..fb5a6b1c62a6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, { int ret = 0; struct btrfs_path *path = btrfs_alloc_path(); + bool locked = false; if (!path) { ret = -ENOMEM; @@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, } if (block_group) { + locked = true; mutex_lock(&trans->transaction->cache_write_mutex); if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); @@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - mutex_unlock(&trans->transaction->cache_write_mutex); - btrfs_abort_transaction(trans, root, ret); - return ret; - } + if (ret) + goto fail; ret = btrfs_update_inode(trans, root, inode); - if (block_group) - mutex_unlock(&trans->transaction->cache_write_mutex); - fail: + if (locked) + mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bb013672aee..855935f6671a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4986,24 +4986,40 @@ static void evict_inode_truncate_pages(struct inode *inode) } write_unlock(&map_tree->lock); + /* + * Keep looping until we have no more ranges in the io tree. + * We can have ongoing bios started by readpages (called from readahead) + * that didn't get their end io callbacks called yet or they are still + * in progress ((extent_io.c:end_bio_extent_readpage()). This means some + * ranges can still be locked and eviction started because before + * submitting those bios, which are executed by a separate task (work + * queue kthread), inode references (inode->i_count) were not taken + * (which would be dropped in the end io callback of each bio). + * Therefore here we effectively end up waiting for those bios and + * anyone else holding locked ranges without having bumped the inode's + * reference count - if we don't do it, when they access the inode's + * io_tree to unlock a range it may be too late, leading to an + * use-after-free issue. + */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; + u64 start; + u64 end; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); - atomic_inc(&state->refs); + start = state->start; + end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, state->start, state->end, - 0, &cached_state); - clear_extent_bit(io_tree, state->start, state->end, + lock_extent_bits(io_tree, start, end, 0, &cached_state); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); - free_extent_state(state); cond_resched(); spin_lock(&io_tree->lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1c22c6518504..c86b835da7a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -553,8 +553,8 @@ static noinline int create_subvol(struct inode *dir, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -1318,7 +1318,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index + 1; + max_to_defrag = last_index - i + 1; /* * make writeback starts from i, so the defrag range can be @@ -1368,7 +1368,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, cluster); - ra_index += max_cluster; + ra_index += cluster; } mutex_lock(&inode->i_mutex); @@ -2271,10 +2271,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, { struct btrfs_ioctl_ino_lookup_args *args; struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) @@ -2282,13 +2279,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, inode = file_inode(file); + /* + * Unprivileged query to obtain the containing subvolume root id. The + * path is reset so it's consistent with btrfs_search_path_in_tree. + */ if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { + args->name[0] = 0; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, args->treeid, args->objectid, args->name); +out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2413,8 +2425,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_unlock_inode; } - d_invalidate(dentry); - down_write(&root->fs_info->subvol_sem); err = may_destroy_subvol(dest); @@ -2508,7 +2518,7 @@ out_up_write: out_unlock_inode: mutex_unlock(&inode->i_mutex); if (!err) { - shrink_dcache_sb(root->fs_info->sb); + d_invalidate(dentry); btrfs_invalidate_inodes(dest); d_delete(dentry); ASSERT(dest->send_in_progress == 0); @@ -2879,12 +2889,19 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, return ret; } -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, + u64 olen) { + u64 len = *plen; u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - if (off + len > inode->i_size || off + len < off) + if (off + olen > inode->i_size || off + olen < off) return -EINVAL; + + /* if we extend to eof, continue to block boundary */ + if (off + len == inode->i_size) + *plen = len = ALIGN(inode->i_size, bs) - off; + /* Check that we are block aligned - btrfs_clone() requires this */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) return -EINVAL; @@ -2892,10 +2909,11 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; + u64 len = olen; /* * btrfs_clone() can't handle extents in the same file @@ -2910,11 +2928,11 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, btrfs_double_lock(src, loff, dst, dst_loff, len); - ret = extent_same_check_offsets(src, loff, len); + ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) goto out_unlock; - ret = extent_same_check_offsets(dst, dst_loff, len); + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); if (ret) goto out_unlock; @@ -2927,7 +2945,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); if (ret == 0) - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff); out_unlock: btrfs_double_unlock(src, loff, dst, dst_loff, len); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 760c4a5e096b..89656d799ff6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && - !(type == BTRFS_ORDERED_NOCOW)) - entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - WARN_ON(entry->csum_bytes_left < sum->len); - entry->csum_bytes_left -= sum->len; - if (entry->csum_bytes_left == 0) - wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - list_add_tail(&ordered->trans_list, &trans->ordered); + /* + * If our ordered extent completed it means it updated the + * fs/subvol and csum trees already, so no need to make the + * current transaction's commit wait for it, as we end up + * holding memory unnecessarily and delaying the inode's iput + * until the transaction commit (we schedule an iput for the + * inode when the ordered extent's refcount drops to 0), which + * prevents it from being evictable until the transaction + * commits. + */ + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) + btrfs_put_ordered_extent(ordered); + else + list_add_tail(&ordered->trans_list, &trans->ordered); + spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -844,6 +851,20 @@ out: return entry; } +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_extent *oe; + + oe = btrfs_lookup_ordered_range(inode, file_offset, len); + if (oe) { + btrfs_put_ordered_extent(oe); + return true; + } + return false; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e96cd4ccd805..7176cc0fe43f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -89,9 +89,6 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; - /* number of bytes that still need csumming */ - u64 csum_bytes_left; - /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. Please see the comment of @@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 file_offset, u64 len); +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3d6546581bb9..d5f1f033b7a0 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -34,6 +34,7 @@ #include "extent_io.h" #include "qgroup.h" + /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys @@ -84,11 +85,42 @@ struct btrfs_qgroup { /* * temp variables for accounting operations + * Refer to qgroup_shared_accouting() for details. */ u64 old_refcnt; u64 new_refcnt; }; +static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->old_refcnt < seq) + qg->old_refcnt = seq; + qg->old_refcnt += mod; +} + +static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->new_refcnt < seq) + qg->new_refcnt = seq; + qg->new_refcnt += mod; +} + +static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->old_refcnt < seq) + return 0; + return qg->old_refcnt - seq; +} + +static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->new_refcnt < seq) + return 0; + return qg->new_refcnt - seq; +} + /* * glue structure to represent the relations between qgroups. */ @@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct ulist *tmp; int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1356,239 +1388,86 @@ out: return ret; } -static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { - /* - * Ignore seq and type here, we're looking for any operation - * at all related to this extent on that root. - */ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - return 0; -} + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + u64 qgroup_to_skip; + int ret = 0; -static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node *n; - struct btrfs_qgroup_operation *cur; - int cmp; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; - spin_lock(&fs_info->qgroup_op_lock); - n = fs_info->qgroup_op_tree.rb_node; - while (n) { - cur = rb_entry(n, struct btrfs_qgroup_operation, n); - cmp = comp_oper_exist(cur, oper); - if (cmp < 0) { - n = n->rb_right; - } else if (cmp) { - n = n->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } + /* + * No need to do lock, since this function will only be called in + * btrfs_commmit_transaction(). + */ + node = rb_first(&delayed_refs->dirty_extent_root); + while (node) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, + &record->old_roots); + if (ret < 0) + break; + if (qgroup_to_skip) + ulist_del(record->old_roots, qgroup_to_skip, 0); + node = rb_next(node); } - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -static int comp_oper(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; - if (oper1->type < oper2->type) - return -1; - if (oper1->type > oper2->type) - return 1; - return 0; + return ret; } -static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_qgroup_operation *cur; - int cmp; + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_qgroup_extent_record *entry; + u64 bytenr = record->bytenr; - spin_lock(&fs_info->qgroup_op_lock); - p = &fs_info->qgroup_op_tree.rb_node; while (*p) { - parent = *p; - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); - cmp = comp_oper(cur, oper); - if (cmp < 0) { - p = &(*p)->rb_right; - } else if (cmp) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, + node); + if (bytenr < entry->bytenr) p = &(*p)->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } - } - rb_link_node(&oper->n, parent, p); - rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -/* - * Record a quota operation for processing later on. - * @trans: the transaction we are adding the delayed op to. - * @fs_info: the fs_info for this fs. - * @ref_root: the root of the reference we are acting on, - * @bytenr: the bytenr we are acting on. - * @num_bytes: the number of bytes in the reference. - * @type: the type of operation this is. - * @mod_seq: do we need to get a sequence number for looking up roots. - * - * We just add it to our trans qgroup_ref_list and carry on and process these - * operations in order at some later point. If the reference root isn't a fs - * root then we don't bother with doing anything. - * - * MUST BE HOLDING THE REF LOCK. - */ -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, - u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, int mod_seq) -{ - struct btrfs_qgroup_operation *oper; - int ret; - - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - return 0; - - oper = kmalloc(sizeof(*oper), GFP_NOFS); - if (!oper) - return -ENOMEM; - - oper->ref_root = ref_root; - oper->bytenr = bytenr; - oper->num_bytes = num_bytes; - oper->type = type; - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); - INIT_LIST_HEAD(&oper->elem.list); - oper->elem.seq = 0; - - trace_btrfs_qgroup_record_ref(oper); - - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { - /* - * If any operation for this bytenr/ref_root combo - * exists, then we know it's not exclusively owned and - * shouldn't be queued up. - * - * This also catches the case where we have a cloned - * extent that gets queued up multiple times during - * drop snapshot. - */ - if (qgroup_oper_exists(fs_info, oper)) { - kfree(oper); - return 0; - } - } - - ret = insert_qgroup_oper(fs_info, oper); - if (ret) { - /* Shouldn't happen so have an assert for developers */ - ASSERT(0); - kfree(oper); - return ret; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; } - list_add_tail(&oper->list, &trans->qgroup_ref_list); - if (mod_seq) - btrfs_get_tree_mod_seq(fs_info, &oper->elem); - - return 0; -} - -static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *tmp; - int sign = 0; - int ret = 0; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - - spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) - goto out; - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - sign = 1; - break; - case BTRFS_QGROUP_OPER_SUB_EXCL: - sign = -1; - break; - default: - ASSERT(0); - } - ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, - oper->num_bytes, sign); -out: - spin_unlock(&fs_info->qgroup_lock); - ulist_free(tmp); - return ret; + rb_link_node(&record->node, parent_node, p); + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); + return NULL; } +#define UPDATE_NEW 0 +#define UPDATE_OLD 1 /* - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as - * properly. + * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, - u64 root_to_skip, struct ulist *tmp, - struct ulist *roots, struct ulist *qgroups, - u64 seq, int *old_roots, int rescan) +static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + struct ulist *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct ulist_node *tmp_unode; struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret; + int ret = 0; + if (!roots) + return 0; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { - /* We don't count our current root here */ - if (unode->val == root_to_skip) - continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - /* - * We could have a pending removal of this same ref so we may - * not have actually found our ref root when doing - * btrfs_find_all_roots, so we need to keep track of how many - * old roots we find in case we removed ours and added a - * different one at the same time. I don't think this could - * happen in practice but that sort of thinking leads to pain - * and suffering and to the dark side. - */ - (*old_roots)++; ulist_reinit(tmp); ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), @@ -1603,29 +1482,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *glist; qg = u64_to_ptr(tmp_unode->aux); - /* - * We use this sequence number to keep from having to - * run the whole list and 0 out the refcnt every time. - * We basically use sequnce as the known 0 count and - * then add 1 everytime we see a qgroup. This is how we - * get how many of the roots actually point up to the - * upper level qgroups in order to determine exclusive - * counts. - * - * For rescan we want to set old_refcnt to seq so our - * exclusive calculations end up correct. - */ - if (rescan) - qg->old_refcnt = seq; - else if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; + if (update_old) + btrfs_qgroup_update_old_refcnt(qg, seq, 1); else - qg->old_refcnt++; - - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; + btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, ptr_to_u64(glist->group), @@ -1644,161 +1504,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, } /* - * We need to walk forward in our operation tree and account for any roots that - * were deleted after we made this operation. - */ -static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct ulist *tmp, - struct ulist *qgroups, u64 seq, - int *old_roots) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - struct btrfs_qgroup_operation *tmp_oper; - struct rb_node *n; - int ret; - - ulist_reinit(tmp); - - /* - * We only walk forward in the tree since we're only interested in - * removals that happened _after_ our operation. - */ - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - return 0; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - while (tmp_oper->bytenr == oper->bytenr) { - /* - * If it's not a removal we don't care, additions work out - * properly with our refcnt tracking. - */ - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) - goto next; - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); - if (!qg) - goto next; - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret) { - if (ret < 0) - return ret; - /* - * We only want to increase old_roots if this qgroup is - * not already in the list of qgroups. If it is already - * there then that means it must have been re-added or - * the delete will be discarded because we had an - * existing ref that we haven't looked up yet. In this - * case we don't want to increase old_roots. So if ret - * == 1 then we know that this is the first time we've - * seen this qgroup and we can bump the old_roots. - */ - (*old_roots)++; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - } -next: - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&tmp_oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - break; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - } - - /* Ok now process the qgroups we found */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* Add refcnt for the newly added reference. */ -static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct btrfs_qgroup *qgroup, - struct ulist *tmp, struct ulist *qgroups, - u64 seq) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - int ret; - - ulist_reinit(tmp); - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - } else { - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * This adjusts the counters for all referenced qgroups if need be. + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * + * Excl update is tricky, the update is split into 2 part. + * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- + * B | * | - | + * ------------------------------------- + * !B | + | ** | + * ------------------------------------- + * + * Conditions: + * A: cur_old_roots < nr_old_roots (not exclusive before) + * !A: cur_old_roots == nr_old_roots (possible exclusive before) + * B: cur_new_roots < nr_new_roots (not exclusive now) + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * + * Results: + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing + * *: Definitely not changed. **: Possible unchanged. + * + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. + * + * To make the logic clear, we first use condition A and B to split + * combination into 4 results. + * + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them + * only on variant maybe 0. + * + * Lastly, check result **, since there are 2 variants maybe 0, split them + * again(2x2). + * But this time we don't need to consider other things, the codes and logic + * is easy to understand now. */ -static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, - u64 root_to_skip, u64 num_bytes, - struct ulist *qgroups, u64 seq, - int old_roots, int new_roots, int rescan) +static int qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct ulist *qgroups, + u64 nr_old_roots, + u64 nr_new_roots, + u64 num_bytes, u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1810,423 +1555,191 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, bool dirty = false; qg = u64_to_ptr(unode->aux); - /* - * Wasn't referenced before but is now, add to the reference - * counters. - */ - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + + /* Rfer update part */ + if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } - - /* - * Was referenced before but isn't now, subtract from the - * reference counters. - */ - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } - if (qg->old_refcnt < seq) - cur_old_count = 0; - else - cur_old_count = qg->old_refcnt - seq; - if (qg->new_refcnt < seq) - cur_new_count = 0; - else - cur_new_count = qg->new_refcnt - seq; - - /* - * If our refcount was the same as the roots previously but our - * new count isn't the same as the number of roots now then we - * went from having a exclusive reference on this range to not. - */ - if (old_roots && cur_old_count == old_roots && - (cur_new_count != new_roots || new_roots == 0)) { - WARN_ON(cur_new_count != new_roots && new_roots == 0); - qg->excl -= num_bytes; - qg->excl_cmpr -= num_bytes; - dirty = true; + /* Excl update part */ + /* Exclusive/none -> shared case */ + if (cur_old_count == nr_old_roots && + cur_new_count < nr_new_roots) { + /* Exclusive -> shared */ + if (cur_old_count != 0) { + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } } - /* - * If we didn't reference all the roots before but now we do we - * have an exclusive reference to this range. - */ - if ((!old_roots || (old_roots && cur_old_count != old_roots)) - && cur_new_count == new_roots) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - dirty = true; + /* Shared -> exclusive/none case */ + if (cur_old_count < nr_old_roots && + cur_new_count == nr_new_roots) { + /* Shared->exclusive */ + if (cur_new_count != 0) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } } + /* Exclusive/none -> exclusive/none case */ + if (cur_old_count == nr_old_roots && + cur_new_count == nr_new_roots) { + if (cur_old_count == 0) { + /* None -> exclusive/none */ + + if (cur_new_count != 0) { + /* None -> exclusive */ + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + /* None -> none, nothing changed */ + } else { + /* Exclusive -> exclusive/none */ + + if (cur_new_count == 0) { + /* Exclusive -> none */ + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + /* Exclusive -> exclusive, nothing changed */ + } + } if (dirty) qgroup_dirty(fs_info, qg); } return 0; } -/* - * If we removed a data extent and there were other references for that bytenr - * then we need to lookup all referenced roots to make sure we still don't - * reference this bytenr. If we do then we can just discard this operation. - */ -static int check_existing_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; - - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - oper->elem.seq, &roots); - if (ret < 0) - return ret; - ret = 0; - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - if (unode->val == oper->ref_root) { - ret = 1; - break; - } - } - ulist_free(roots); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - - return ret; -} - -/* - * If we share a reference across multiple roots then we may need to adjust - * various qgroups referenced and exclusive counters. The basic premise is this - * - * 1) We have seq to represent a 0 count. Instead of looping through all of the - * qgroups and resetting their refcount to 0 we just constantly bump this - * sequence number to act as the base reference count. This means that if - * anybody is equal to or below this sequence they were never referenced. We - * jack this sequence up by the number of roots we found each time in order to - * make sure we don't have any overlap. - * - * 2) We first search all the roots that reference the area _except_ the root - * we're acting on currently. This makes up the old_refcnt of all the qgroups - * before. - * - * 3) We walk all of the qgroups referenced by the root we are currently acting - * on, and will either adjust old_refcnt in the case of a removal or the - * new_refcnt in the case of an addition. - * - * 4) Finally we walk all the qgroups that are referenced by this range - * including the root we are acting on currently. We will adjust the counters - * based on the number of roots we had and will have after this operation. - * - * Take this example as an illustration - * - * [qgroup 1/0] - * / | \ - * [qg 0/0] [qg 0/1] [qg 0/2] - * \ | / - * [ extent ] - * - * Say we are adding a reference that is covered by qg 0/0. The first step - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with - * old_roots being 2. Because it is adding new_roots will be 1. We then go - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's - * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a - * reference and thus must add the size to the referenced bytes. Everything - * else is the same so nothing else changes. - */ -static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, + struct ulist *old_roots, struct ulist *new_roots) { - struct ulist *roots = NULL; - struct ulist *qgroups, *tmp; - struct btrfs_qgroup *qgroup; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct ulist *qgroups = NULL; + struct ulist *tmp = NULL; u64 seq; - int old_roots = 0; - int new_roots = 0; + u64 nr_new_roots = 0; + u64 nr_old_roots = 0; int ret = 0; - if (oper->elem.seq) { - ret = check_existing_refs(trans, fs_info, oper); - if (ret < 0) - return ret; - if (ret) - return 0; - } + if (new_roots) + nr_new_roots = new_roots->nnodes; + if (old_roots) + nr_old_roots = old_roots->nnodes; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - return -ENOMEM; + if (!fs_info->quota_enabled) + goto out_free; + BUG_ON(!fs_info->quota_root); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) { + ret = -ENOMEM; + goto out_free; + } tmp = ulist_alloc(GFP_NOFS); if (!tmp) { - ulist_free(qgroups); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, - &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) { - ulist_free(qgroups); - ulist_free(tmp); - return ret; + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + ret = 0; + goto out_free; + } } + mutex_unlock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; seq = fs_info->qgroup_seq; - /* - * So roots is the list of all the roots currently pointing at the - * bytenr, including the ref we are adding if we are adding, or not if - * we are removing a ref. So we pass in the ref_root to skip that root - * in our calculations. We set old_refnct and new_refcnt cause who the - * hell knows what everything looked like before, and it doesn't matter - * except... - */ - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, - seq, &old_roots, 0); + /* Update old refcnts using old_roots */ + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, + UPDATE_OLD); if (ret < 0) goto out; - /* - * Now adjust the refcounts of the qgroups that care about this - * reference, either the old_count in the case of removal or new_count - * in the case of an addition. - */ - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, - seq); + /* Update new refcnts using new_roots */ + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, + UPDATE_NEW); if (ret < 0) goto out; - /* - * ...in the case of removals. If we had a removal before we got around - * to processing this operation then we need to find that guy and count - * his references as if they really existed so we don't end up screwing - * up the exclusive counts. Then whenever we go to process the delete - * everything will be grand and we can account for whatever exclusive - * changes need to be made there. We also have to pass in old_roots so - * we have an accurate count of the roots as it pertains to this - * operations view of the world. - */ - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, - &old_roots); - if (ret < 0) - goto out; + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + num_bytes, seq); /* - * We are adding our root, need to adjust up the number of roots, - * otherwise old_roots is the number of roots we want. + * Bump qgroup_seq to avoid seq overlap */ - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - new_roots = old_roots + 1; - } else { - new_roots = old_roots; - old_roots++; - } - fs_info->qgroup_seq += old_roots + 1; - - - /* - * And now the magic happens, bless Arne for having a pretty elegant - * solution for this. - */ - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, - qgroups, seq, old_roots, new_roots, 0); + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(qgroups); - ulist_free(roots); +out_free: ulist_free(tmp); + ulist_free(qgroups); + ulist_free(old_roots); + ulist_free(new_roots); return ret; } -/* - * Process a reference to a shared subtree. This type of operation is - * queued during snapshot removal when we encounter extents which are - * shared between more than one root. - */ -static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup_list *glist; - struct ulist *parents; - int ret = 0; - int err; - struct btrfs_qgroup *qg; - u64 root_obj = 0; - struct seq_list elem = SEQ_LIST_INIT(elem); - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - elem.seq, &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) - goto out; - - if (roots->nnodes != 1) - goto out; - - ULIST_ITER_INIT(&uiter); - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ - /* - * If we find our ref root then that means all refs - * this extent has to the root have not yet been - * deleted. In that case, we do nothing and let the - * last ref for this bytenr drive our update. - * - * This can happen for example if an extent is - * referenced multiple times in a snapshot (clone, - * etc). If we are in the middle of snapshot removal, - * queued updates for such an extent will find the - * root if we have not yet finished removing the - * snapshot. - */ - if (unode->val == oper->ref_root) - goto out; - - root_obj = unode->val; - BUG_ON(!root_obj); - - spin_lock(&fs_info->qgroup_lock); - qg = find_qgroup_rb(fs_info, root_obj); - if (!qg) - goto out_unlock; - - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* - * Adjust counts for parent groups. First we find all - * parents, then in the 2nd loop we do the adjustment - * while adding parents of the parents to our ulist. - */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(parents, &uiter))) { - qg = u64_to_ptr(unode->aux); - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - } - -out_unlock: - spin_unlock(&fs_info->qgroup_lock); - -out: - ulist_free(roots); - ulist_free(parents); - return ret; -} - -/* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. - */ -static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct ulist *new_roots = NULL; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + if (!ret) { + /* + * Use (u64)-1 as time_seq to do special search, which + * doesn't lock tree or delayed_refs and search current + * root. It's safe inside commit_transaction(). + */ + ret = btrfs_find_all_roots(trans, fs_info, + record->bytenr, (u64)-1, &new_roots); + if (ret < 0) + goto cleanup; + if (qgroup_to_skip) + ulist_del(new_roots, qgroup_to_skip, 0); + ret = btrfs_qgroup_account_extent(trans, fs_info, + record->bytenr, record->num_bytes, + record->old_roots, new_roots); + record->old_roots = NULL; + new_roots = NULL; } - } - mutex_unlock(&fs_info->qgroup_rescan_lock); +cleanup: + ulist_free(record->old_roots); + ulist_free(new_roots); + new_roots = NULL; + rb_erase(node, &delayed_refs->dirty_extent_root); + kfree(record); - ASSERT(is_fstree(oper->ref_root)); - - trace_btrfs_qgroup_account(oper); - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - case BTRFS_QGROUP_OPER_SUB_EXCL: - ret = qgroup_excl_accounting(fs_info, oper); - break; - case BTRFS_QGROUP_OPER_ADD_SHARED: - case BTRFS_QGROUP_OPER_SUB_SHARED: - ret = qgroup_shared_accounting(trans, fs_info, oper); - break; - case BTRFS_QGROUP_OPER_SUB_SUBTREE: - ret = qgroup_subtree_accounting(trans, fs_info, oper); - break; - default: - ASSERT(0); - } - return ret; -} - -/* - * Needs to be called everytime we run delayed refs, even if there is an error - * in order to cleanup outstanding operations. - */ -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_operation *oper; - int ret = 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - oper = list_first_entry(&trans->qgroup_ref_list, - struct btrfs_qgroup_operation, list); - list_del_init(&oper->list); - if (!ret || !trans->aborted) - ret = btrfs_qgroup_account(trans, fs_info, oper); - spin_lock(&fs_info->qgroup_op_lock); - rb_erase(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - kfree(oper); } return ret; } @@ -2637,15 +2150,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *qgroups, - struct ulist *tmp, struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, + struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; - u64 seq; - int new_roots; int slot; int ret; @@ -2695,33 +2206,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ulist_reinit(qgroups); ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, &roots); if (ret < 0) goto out; - spin_lock(&fs_info->qgroup_lock); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - - new_roots = 0; - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, - seq, &new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, - seq, 0, new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); + /* For rescan, just pass old_roots as NULL */ + ret = btrfs_qgroup_account_extent(trans, fs_info, + found.objectid, num_bytes, NULL, roots); + if (ret < 0) goto out; - } - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); } out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -2735,7 +2228,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; @@ -2743,12 +2235,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path = btrfs_alloc_path(); if (!path) goto out; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - goto out; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - goto out; scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); if (!scratch_leaf) goto out; @@ -2764,7 +2250,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - qgroups, tmp, scratch_leaf); + scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2774,8 +2260,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); - ulist_free(qgroups); - ulist_free(tmp); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index c5242aa9a4b2..6387dcfa354c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -19,43 +19,18 @@ #ifndef __BTRFS_QGROUP__ #define __BTRFS_QGROUP__ +#include "ulist.h" +#include "delayed-ref.h" + /* - * A description of the operations, all of these operations only happen when we - * are adding the 1st reference for that subvolume in the case of adding space - * or on the last reference delete in the case of subtraction. The only - * exception is the last one, which is added for confusion. - * - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only - * one pointing at the bytes we are adding. This is called on the first - * allocation. - * - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be - * shared between subvols. This is called on the creation of a ref that already - * has refs from a different subvolume, so basically reflink. - * - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only - * one referencing the range. - * - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with - * refs with other subvolumes. + * Record a dirty extent, and info qgroup to update quota on it + * TODO: Use kmem cache to alloc it. */ -enum btrfs_qgroup_operation_type { - BTRFS_QGROUP_OPER_ADD_EXCL, - BTRFS_QGROUP_OPER_ADD_SHARED, - BTRFS_QGROUP_OPER_SUB_EXCL, - BTRFS_QGROUP_OPER_SUB_SHARED, - BTRFS_QGROUP_OPER_SUB_SUBTREE, -}; - -struct btrfs_qgroup_operation { - u64 ref_root; +struct btrfs_qgroup_extent_record { + struct rb_node node; u64 bytenr; u64 num_bytes; - u64 seq; - enum btrfs_qgroup_operation_type type; - struct seq_list elem; - struct rb_node n; - struct list_head list; + struct ulist *old_roots; }; int btrfs_quota_enable(struct btrfs_trans_handle *trans, @@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, - int mod_seq); -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper); + struct ulist *old_roots, struct ulist *new_roots); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74b24b01d574..827951fbf7fc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1847,8 +1847,10 @@ again: } eb = read_tree_block(dest, old_bytenr, old_ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { - ret = (!eb) ? -ENOMEM : -EIO; + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { + ret = -EIO; free_extent_buffer(eb); break; } @@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); eb = read_tree_block(root, bytenr, ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, generation); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + err = PTR_ERR(eb); + goto next; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; goto next; @@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.offset); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ab5811545a98..9f2feabe99f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2662,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity) kfree(sparity); } +static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +{ + struct scrub_parity *sparity = container_of(work, struct scrub_parity, + work); + struct scrub_ctx *sctx = sparity->sctx; + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); +} + static void scrub_parity_bio_endio(struct bio *bio, int error) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; - struct scrub_ctx *sctx = sparity->sctx; if (error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); bio_put(bio); + + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, + scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, + &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -3589,6 +3601,13 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } + fs_info->scrub_parity_workers = + btrfs_alloc_workqueue("btrfs-scrubparity", flags, + max_active, 2); + if (!fs_info->scrub_parity_workers) { + ret = -ENOMEM; + goto out; + } } ++fs_info->scrub_workers_refcnt; out: @@ -3601,6 +3620,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a1216f9b4917..aa72bfd28f7d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -243,6 +243,7 @@ struct waiting_dir_move { * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; + bool orphanized; }; struct orphan_dir_info { @@ -1158,6 +1159,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + /* data offset in the file extent item */ + u64 data_offset; + /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) if (ret < 0) return ret; - if (offset + bctx->extent_len > i_size) + if (offset + bctx->data_offset + bctx->extent_len > i_size) return 0; /* @@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; + /* + * For non-compressed extents iterate_extent_inodes() gives us extent + * offsets that already take into account the data offset, but not for + * compressed extents, since the offset is logical and not relative to + * the physical extent locations. We must take this into account to + * avoid sending clone offsets that go beyond the source file's size, + * which would result in the clone ioctl failing with -EINVAL on the + * receiving end. + */ + if (compressed == BTRFS_COMPRESS_NONE) + backref_ctx->data_offset = 0; + else + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - /* we know that it is or will be overwritten. check this now */ - if (ow_inode < sctx->send_progress) + /* + * We know that it is or will be overwritten. Check this now. + * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore ow_inode can actually be the + * same as sctx->send_progress. + */ + if (ow_inode <= sctx->send_progress) ret = 1; else ret = 0; @@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + fs_path_reset(name); if (is_waiting_for_rm(sctx, ino)) { @@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - if (is_waiting_for_move(sctx, ino)) { + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { @@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - sctx->parent_root->root_item.uuid); + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(sctx->parent_root->root_item.ctransid)); } @@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) return entry != NULL; } -static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) { struct rb_node **p = &sctx->waiting_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; + dm->orphanized = orphanized; while (*p) { parent = *p; @@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, goto out; } - ret = add_waiting_dir_move(sctx, pm->ino); + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); if (ret) goto out; @@ -3353,8 +3386,40 @@ out: return ret; } +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + int ret; + u64 parent; + u64 parent_gen; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) { + if (ret == -ENOENT && ino == ino2) + ret = 0; + return ret; + } + if (parent == ino1) + return parent_gen == ino1_gen ? 1 : 0; + ino = parent; + } + return 0; +} + static int wait_for_parent_move(struct send_ctx *sctx, - struct recorded_ref *parent_ref) + struct recorded_ref *parent_ref, + const bool is_orphan) { int ret = 0; u64 ino = parent_ref->dir; @@ -3374,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx, * Our current directory inode may not yet be renamed/moved because some * ancestor (immediate or not) has to be renamed/moved first. So find if * such ancestor exists and make sure our own rename/move happens after - * that ancestor is processed. + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). */ while (ino > BTRFS_FIRST_FREE_OBJECTID) { if (is_waiting_for_move(sctx, ino)) { - ret = 1; + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delayed the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); break; } @@ -3420,7 +3498,7 @@ out: ino, &sctx->new_refs, &sctx->deleted_refs, - false); + is_orphan); if (!ret) ret = 1; } @@ -3589,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move @@ -3609,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = wait_for_parent_move(sctx, cur); - if (ret < 0) - goto out; - if (ret) { - *pending_move = 1; - } else { - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, - cur->full_path); - } + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); if (ret < 0) goto out; } else { @@ -4508,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " if (ret < 0) goto out; - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root->root->root_item.uuid); + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9e66f5e724db..cd7ef34d2dce 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. */ +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. */ +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - /* - * Report first abort since mount - */ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, - &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", - errno); - } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, * __btrfs_panic decodes unexpected, fatal errors from the caller, * issues an alert, and either panics or BUGs, depending on mount options. */ +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -841,33 +836,153 @@ out: return error; } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; + struct btrfs_root *fs_root; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path *path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + path->leave_spinning = 1; + + name = kmalloc(PATH_MAX, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. + * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, subvol_objectid, + BTRFS_ROOT_BACKREF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. + */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(fs_root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); + } } + btrfs_free_path(path); + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); + } + return name; + +err: + btrfs_free_path(path); + kfree(name); + return ERR_PTR(ret); +} + +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + u64 dir_id; + path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); + return -ENOMEM; path->leave_spinning = 1; /* @@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb, di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); - return ERR_CAST(di); + return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and - * mount to root most subvolume. + * mount the top-level subvolume. */ btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - if (!(sb->s_flags & MS_RDONLY)) { - int ret; - down_read(&fs_info->cleanup_work_sem); - ret = btrfs_orphan_cleanup(new_root); - up_read(&fs_info->cleanup_work_sem); - if (ret) - return ERR_PTR(ret); - } - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. - */ - if (!new && d_inode(sb->s_root) == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_root(inode); + *objectid = location.objectid; + return 0; } static int btrfs_fill_super(struct super_block *sb, @@ -1108,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_puts(seq, ",subvol="); + seq_dentry(seq, dentry, " \t\n\\"); return 0; } @@ -1138,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode) } /* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. + * This will add subvolid=0 to the argument string while removing any subvol= + * and subvolid= arguments to make sure we get the top-level root for path + * walking to the subvol we want. */ static char *setup_root_args(char *args) { - unsigned len = strlen(args) + 2 + 1; - char *src, *dst, *buf; + char *buf, *dst, *sep; - /* - * We need the same args as before, but with this substitution: - * s!subvol=[^,]+!subvolid=0! - * - * Since the replacement string is up to 2 bytes longer than the - * original, allocate strlen(args) + 2 + 1 bytes. - */ + if (!args) + return kstrdup("subvolid=0", GFP_NOFS); - src = strstr(args, "subvol="); - /* This shouldn't happen, but just in case.. */ - if (!src) - return NULL; - - buf = dst = kmalloc(len, GFP_NOFS); + /* The worst case is that we add ",subvolid=0" to the end. */ + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); if (!buf) return NULL; - /* - * If the subvol= arg is not at the start of the string, - * copy whatever precedes it into buf. - */ - if (src != args) { - *src++ = '\0'; - strcpy(buf, args); - dst += strlen(args); + while (1) { + sep = strchrnul(args, ','); + if (!strstarts(args, "subvol=") && + !strstarts(args, "subvolid=")) { + memcpy(dst, args, sep - args); + dst += sep - args; + *dst++ = ','; + } + if (*sep) + args = sep + 1; + else + break; } - strcpy(dst, "subvolid=0"); - dst += strlen("subvolid=0"); - - /* - * If there is a "," after the original subvol=... string, - * copy that suffix into our buffer. Otherwise, we're done. - */ - src = strchr(src, ','); - if (src) - strcpy(dst, src); return buf; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + int flags, const char *device_name, + char *data) { struct dentry *root; - struct vfsmount *mnt; + struct vfsmount *mnt = NULL; char *newargs; + int ret; newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); + if (!newargs) { + root = ERR_PTR(-ENOMEM); + goto out; + } - if (PTR_RET(mnt) == -EBUSY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + device_name, newargs); } else { - int r; - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + device_name, newargs); if (IS_ERR(mnt)) { - kfree(newargs); - return ERR_CAST(mnt); + root = ERR_CAST(mnt); + mnt = NULL; + goto out; } - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); - if (r < 0) { - /* FIXME: release vfsmount mnt ??*/ - kfree(newargs); - return ERR_PTR(r); + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); + up_write(&mnt->mnt_sb->s_umount); + if (ret < 0) { + root = ERR_PTR(ret); + goto out; } } } + if (IS_ERR(mnt)) { + root = ERR_CAST(mnt); + mnt = NULL; + goto out; + } - kfree(newargs); + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), + subvol_objectid); + if (IS_ERR(subvol_name)) { + root = ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + } root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; - if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { + if (!IS_ERR(root)) { struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", - subvol_name); + struct inode *root_inode = d_inode(root); + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + pr_err("BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. + */ + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } } +out: + mntput(mnt); + kfree(newargs); + kfree(subvol_name); return root; } @@ -1303,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev = NULL; struct super_block *s; - struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; @@ -1323,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + /* mount_subvol() will free subvol_name. */ + return mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data); } security_init_mnt_opts(&new_sec_opts); @@ -1392,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } - - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) { + if (error) { deactivate_locked_super(s); - error = PTR_ERR(root); goto error_sec_opts; } fs_info = btrfs_sb(s); error = setup_security_options(fs_info, s, &new_sec_opts); if (error) { - dput(root); deactivate_locked_super(s); goto error_sec_opts; } - return root; + return dget(s->s_root); error_close_devices: btrfs_close_devices(fs_devices); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e8a4c86d274d..603b0cc2b9bb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -33,6 +33,7 @@ #include "volumes.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) @@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); -static struct attribute *btrfs_attrs[] = { +static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), @@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = { static void btrfs_release_super_kobj(struct kobject *kobj) { - struct btrfs_fs_info *fs_info = to_fs_info(kobj); - complete(&fs_info->kobj_unregister); + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); + + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_super_kobj, - .default_attrs = btrfs_attrs, }; +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_devices, super_kobj); +} + static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_info, super_kobj); + return to_fs_devs(kobj)->fs_info; } #define NUM_FEATURE_BITS 64 @@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, &agroup); } @@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) return 0; } -static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + if (fs_devs->device_dir_kobj) { + kobject_del(fs_devs->device_dir_kobj); + kobject_put(fs_devs->device_dir_kobj); + fs_devs->device_dir_kobj = NULL; + } + + if (fs_devs->super_kobj.state_initialized) { + kobject_del(&fs_devs->super_kobj); + kobject_put(&fs_devs->super_kobj); + wait_for_completion(&fs_devs->kobj_unregister); + } +} + +/* when fs_devs is NULL it will remove all fsid kobject */ +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - kobject_del(&fs_info->super_kobj); - kobject_put(&fs_info->super_kobj); - wait_for_completion(&fs_info->kobj_unregister); + struct list_head *fs_uuids = btrfs_get_fs_uuids(); + + if (fs_devs) { + __btrfs_sysfs_remove_fsid(fs_devs); + return; + } + + list_for_each_entry(fs_devs, fs_uuids, list) { + __btrfs_sysfs_remove_fsid(fs_devs); + } } void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { + btrfs_reset_fs_info_ptr(fs_info); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } - kobject_del(fs_info->device_dir_kobj); - kobject_put(fs_info->device_dir_kobj); addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); - __btrfs_sysfs_remove_one(fs_info); + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -602,40 +635,60 @@ static void init_feature_attrs(void) } } -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +/* when one_device is NULL, it removes all device links */ + +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_info->device_dir_kobj) + if (!fs_devices->device_dir_kobj) return -EINVAL; if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_info->device_dir_kobj, + sysfs_remove_link(fs_devices->device_dir_kobj, + disk_kobj->name); + } + + if (one_device) + return 0; + + list_for_each_entry(one_device, + &fs_devices->devices, dev_list) { + if (!one_device->bdev) + continue; + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_devices->device_dir_kobj, disk_kobj->name); } return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { - int error = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *dev; - - if (!fs_info->device_dir_kobj) - fs_info->device_dir_kobj = kobject_create_and_add("devices", - &fs_info->super_kobj); + if (!fs_devs->device_dir_kobj) + fs_devs->device_dir_kobj = kobject_create_and_add("devices", + &fs_devs->super_kobj); - if (!fs_info->device_dir_kobj) + if (!fs_devs->device_dir_kobj) return -ENOMEM; + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_device *dev; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_info->device_dir_kobj, + error = sysfs_create_link(fs_devices->device_dir_kobj, disk_kobj, disk_kobj->name); if (error) break; @@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry; /* Debugging tunables and exported data */ u64 btrfs_debugfs_test; +/* + * Can be called by the device discovery thread. + * And parent can be specified for seed device + */ +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent) +{ + int error; + + init_completion(&fs_devs->kobj_unregister); + fs_devs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->super_kobj, + &btrfs_ktype, parent, "%pU", fs_devs->fsid); + return error; +} + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; + struct kobject *super_kobj = &fs_devs->super_kobj; + + btrfs_set_fs_info_ptr(fs_info); - init_completion(&fs_info->kobj_unregister); - fs_info->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, - "%pU", fs_info->fsid); + error = btrfs_kobj_add_device(fs_devs, NULL); if (error) return error; - error = sysfs_create_group(&fs_info->super_kobj, - &btrfs_feature_attr_group); + error = sysfs_create_files(super_kobj, btrfs_attrs); if (error) { - __btrfs_sysfs_remove_one(fs_info); + btrfs_kobj_rm_device(fs_devs, NULL); return error; } - error = addrm_unknown_feature_attrs(fs_info, true); + error = sysfs_create_group(super_kobj, + &btrfs_feature_attr_group); if (error) goto failure; - error = btrfs_kobj_add_device(fs_info, NULL); + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - &fs_info->super_kobj); + super_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3a4bbed723fd..6392527bcc15 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent); +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index c32a7ba76bca..846d277b1901 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -21,6 +21,7 @@ #include "../transaction.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../backref.h" static void init_dummy_trans(struct btrfs_trans_handle *trans) { @@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + /* + * Since the test trans doesn't havee the complicated delayed refs, + * we can only call btrfs_qgroup_account_extent() directly to test + * quota. + */ + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } @@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root) if (ret) return ret; - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root) test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } + old_roots = NULL; + new_roots = NULL; + + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } ret = remove_extent_item(root, 4096, 4096); if (ret) return -EINVAL; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't remove space from the qgroup %d\n", ret); - return -EINVAL; + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } @@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = add_tree_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = remove_extent_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5628e25250c0..c0f18e7266b6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -225,12 +225,14 @@ loop: cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.pending_csums = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -509,6 +511,7 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->chunk_bytes_reserved = 0; h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; @@ -792,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + btrfs_trans_release_chunk_metadata(trans); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root) && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { @@ -1290,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (pending->error) goto no_free_objectid; + /* + * Make qgroup to skip current new snapshot's qgroupid, as it is + * accounted by later btrfs_qgroup_inherit(). + */ + btrfs_set_skip_qgroup(trans, objectid); + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { @@ -1298,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) - goto no_free_objectid; + goto clear_skip_qgroup; } key.objectid = objectid; @@ -1396,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto fail; } - - /* - * We need to flush delayed refs in order to make sure all of our quota - * operations have been done before we call btrfs_qgroup_inherit. - */ - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); @@ -1497,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * account qgroup counters before qgroup_inherit() + */ + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; +clear_skip_qgroup: + btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); root_item_alloc_fail: @@ -1963,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto scrub_continue; } + /* Reocrd old roots for later qgroup accounting */ + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + /* * make sure none of the code above managed to slip in a * delayed item @@ -2004,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_free_log_root_tree(trans, root->fs_info); + /* + * Since fs roots are all committed, we can get a quite accurate + * new_roots. So let's do quota accounting. + */ + ret = btrfs_qgroup_account_extents(trans, root->fs_info); + if (ret < 0) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); @@ -2054,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + btrfs_trans_release_chunk_metadata(trans); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -2123,6 +2161,7 @@ scrub_continue: btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; if (trans->qgroup_reserved) { btrfs_qgroup_free(root, trans->qgroup_reserved); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0b24755596ba..eb09c2067fa8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -102,6 +102,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 chunk_bytes_reserved; u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -153,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_unlock(&BTRFS_I(inode)->lock); } +/* + * Make qgroup codes to skip given qgroupid, means the old/new_roots for + * qgroup won't contain the qgroupid in it. + */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; +} + int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a63719cc9578..a4b9c8b2d35a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; - if (btrfs_test_opt(root, SSD)) - goto out; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d04968374e9d..1ce80c1c4eb6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3881,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, &ordered->flags)) continue; - if (ordered->csum_bytes_left) { - btrfs_start_ordered_extent(inode, ordered, 0); - wait_event(ordered->wait, - ordered->csum_bytes_left == 0); - } - list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 840a38b2778a..91feb2bdefee 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) return NULL; } +static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) +{ + rb_erase(&node->rb_node, &ulist->root); + list_del(&node->list); + kfree(node); + BUG_ON(ulist->nnodes == 0); + ulist->nnodes--; +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { struct rb_node **p = &ulist->root.rb_node; @@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, node->val = val; node->aux = aux; -#ifdef CONFIG_BTRFS_DEBUG - node->seqnum = ulist->nnodes; -#endif ret = ulist_rbtree_insert(ulist, node); ASSERT(!ret); @@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, return 1; } +/* + * ulist_del - delete one node from ulist + * @ulist: ulist to remove node from + * @val: value to delete + * @aux: aux to delete + * + * The deletion will only be done when *BOTH* val and aux matches. + * Return 0 for successful delete. + * Return > 0 for not found. + */ +int ulist_del(struct ulist *ulist, u64 val, u64 aux) +{ + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + /* Not found */ + if (!node) + return 1; + + if (node->aux != aux) + return 1; + + /* Found and delete */ + ulist_rbtree_erase(ulist, node); + return 0; +} + /** * ulist_next - iterate ulist * @ulist: ulist to iterate @@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) uiter->cur_list = uiter->cur_list->next; } else { uiter->cur_list = ulist->nodes.next; -#ifdef CONFIG_BTRFS_DEBUG - uiter->i = 0; -#endif } node = list_entry(uiter->cur_list, struct ulist_node, list); -#ifdef CONFIG_BTRFS_DEBUG - ASSERT(node->seqnum == uiter->i); - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); - uiter->i++; -#endif return node; } diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 4c29db604bbe..a01a2c45825f 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); +int ulist_del(struct ulist *ulist, u64 val, u64 aux); /* just like ulist_add_merge() but take a pointer for the aux data */ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 53af23f2c087..4b438b4c8c91 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +struct list_head *btrfs_get_fs_uuids(void) +{ + return &fs_uuids; +} static struct btrfs_fs_devices *__alloc_fs_devices(void) { @@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } + +void btrfs_free_stale_device(struct btrfs_device *cur_dev) +{ + struct btrfs_fs_devices *fs_devs; + struct btrfs_device *dev; + + if (!cur_dev->name) + return; + + list_for_each_entry(fs_devs, &fs_uuids, list) { + int del = 1; + + if (fs_devs->opened) + continue; + if (fs_devs->seeding) + continue; + + list_for_each_entry(dev, &fs_devs->devices, dev_list) { + + if (dev == cur_dev) + continue; + if (!dev->name) + continue; + + /* + * Todo: This won't be enough. What if the same device + * comes back (with new uuid and) with its mapper path? + * But for now, this does help as mostly an admin will + * either use mapper or non mapper path throughout. + */ + rcu_read_lock(); + del = strcmp(rcu_str_deref(dev->name), + rcu_str_deref(cur_dev->name)); + rcu_read_unlock(); + if (!del) + break; + } + + if (!del) { + /* delete the stale device */ + if (fs_devs->num_devices == 1) { + btrfs_sysfs_remove_fsid(fs_devs); + list_del(&fs_devs->list); + free_fs_devices(fs_devs); + } else { + fs_devs->num_devices--; + list_del(&dev->dev_list); + rcu_string_free(dev->name); + kfree(dev); + } + break; + } + } +} + /* * Add new device to list of registered devices * @@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; + /* + * if there is new btrfs on an already registered device, + * then remove the stale device entry. + */ + btrfs_free_stale_device(device); + *fs_devices_ret = fs_devices; return ret; @@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head) static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; + struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) return 0; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; struct rcu_string *name; @@ -1067,15 +1132,31 @@ again: map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + u64 end; + if (map->stripes[i].dev != device) continue; if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= physical_start) continue; - *start = map->stripes[i].physical + - em->orig_block_len; - ret = 1; + /* + * Make sure that while processing the pinned list we do + * not override our *start with a lower value, because + * we can have pinned chunks that fall within this + * device hole and that have lower physical addresses + * than the pending chunks we processed before. If we + * do not take this special care we can end up getting + * 2 pending chunks that start at the same physical + * device offsets because the end offset of a pinned + * chunk can be equal to the start offset of some + * pending chunk. + */ + end = map->stripes[i].physical + em->orig_block_len; + if (end > *start) { + *start = end; + ret = 1; + } } } if (search_list == &trans->transaction->pending_chunks) { @@ -1706,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1875,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); + + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + if (tgtdev->bdev) { btrfs_scratch_superblock(tgtdev); fs_info->fs_devices->open_devices--; @@ -2211,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info, device); + btrfs_kobj_add_device(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2252,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) - goto error_trans; + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + fsid_buf)) + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2289,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2609,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, return -EINVAL; } map = (struct map_lookup *)em->bdev; + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, extent_root, map->type); + unlock_chunks(root->fs_info->chunk_root); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -3908,9 +3996,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) uuid_root = btrfs_create_tree(trans, fs_info, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { - btrfs_abort_transaction(trans, tree_root, - PTR_ERR(uuid_root)); - return PTR_ERR(uuid_root); + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, tree_root, ret); + return ret; } fs_info->uuid_root = uuid_root; @@ -3965,6 +4053,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) int slot; int failed = 0; bool retried = false; + bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; @@ -4045,15 +4134,6 @@ again: goto again; } else if (failed && retried) { ret = -ENOSPC; - lock_chunks(root); - - btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) - device->fs_devices->total_rw_bytes += diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += diff; - spin_unlock(&root->fs_info->free_chunk_lock); - unlock_chunks(root); goto done; } @@ -4065,6 +4145,35 @@ again: } lock_chunks(root); + + /* + * We checked in the above loop all device extents that were already in + * the device tree. However before we have updated the device's + * total_bytes to the new size, we might have had chunk allocations that + * have not complete yet (new block groups attached to transaction + * handles), and therefore their device extents were not yet in the + * device tree and we missed them in the loop above. So if we have any + * pending chunk using a device extent that overlaps the device range + * that we can not use anymore, commit the current transaction and + * repeat the search on the device tree - this way we guarantee we will + * not have chunks using device extents that end beyond 'new_size'. + */ + if (!checked_pending_chunks) { + u64 start = new_size; + u64 len = old_size - new_size; + + if (contains_pending_extent(trans, device, &start, len)) { + unlock_chunks(root); + checked_pending_chunks = true; + failed = 0; + retried = false; + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto done; + goto again; + } + } + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, @@ -4079,6 +4188,16 @@ again: btrfs_end_transaction(trans, root); done: btrfs_free_path(path); + if (ret) { + lock_chunks(root); + btrfs_device_set_total_bytes(device, old_size); + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); + unlock_chunks(root); + } return ret; } @@ -6072,6 +6191,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); return -EIO; } + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", + devid, uuid); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6191,10 +6312,11 @@ static int read_one_dev(struct btrfs_root *root, if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, fs_devices, devid, dev_uuid); if (!device) return -ENOMEM; + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", + devid, dev_uuid); } else { if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -6722,3 +6844,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, } unlock_chunks(root); } + +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = fs_info; + fs_devices = fs_devices->seed; + } +} + +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = NULL; + fs_devices = fs_devices->seed; + } +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index cedae0356558..95842a909e7f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -253,6 +253,12 @@ struct btrfs_fs_devices { * nonrot flag set */ int rotating; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ + struct kobject super_kobj; + struct kobject *device_dir_kobj; + struct completion kobj_unregister; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -535,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +struct list_head *btrfs_get_fs_uuids(void); +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8c52472d2efa..aecd0859eacb 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -43,7 +43,6 @@ struct cachefiles_object { loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ -#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ atomic_t usage; /* object usage count */ uint8_t type; /* object type */ uint8_t new; /* T if object new */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ab857ab9f40d..fc1056f5c96a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, * call vfs_unlink(), vfs_rmdir() or vfs_rename() */ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry) + struct dentry *dentry, + enum fscache_why_object_killed why) { struct cachefiles_object *object; struct rb_node *p; @@ -132,8 +133,9 @@ found_dentry: pr_err("\n"); pr_err("Error: Can't preemptively bury live object\n"); cachefiles_printk_object(object, NULL); - } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - pr_err("Error: Object already preemptively buried\n"); + } else { + if (why != FSCACHE_OBJECT_IS_STALE) + fscache_object_mark_killed(&object->fscache, why); } write_unlock(&cache->active_lock); @@ -265,7 +267,8 @@ requeue: static int cachefiles_bury_object(struct cachefiles_cache *cache, struct dentry *dir, struct dentry *rep, - bool preemptive) + bool preemptive, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; @@ -289,7 +292,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } mutex_unlock(&d_inode(dir)->i_mutex); @@ -394,7 +397,7 @@ try_again: "Rename failed with error %d", ret); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } unlock_rename(cache->graveyard, dir); @@ -422,7 +425,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); - if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); @@ -433,7 +436,8 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, * may have been renamed */ if (dir == object->dentry->d_parent) { ret = cachefiles_bury_object(cache, dir, - object->dentry, false); + object->dentry, false, + FSCACHE_OBJECT_WAS_RETIRED); } else { /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore @@ -522,7 +526,7 @@ lookup_again: if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mkdir(&path, next, 0); @@ -551,7 +555,7 @@ lookup_again: if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mknod(&path, next, S_IFREG, 0); @@ -602,7 +606,8 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next, true); + ret = cachefiles_bury_object(cache, dir, next, true, + FSCACHE_OBJECT_IS_STALE); dput(next); next = NULL; @@ -610,6 +615,7 @@ lookup_again: goto delete_error; _debug("redo lookup"); + fscache_object_retrying_stale(&object->fscache); goto lookup_again; } } @@ -662,6 +668,8 @@ lookup_again: _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); return 0; +no_space_error: + fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE); create_error: _debug("create error %d", ret); if (ret == -EIO) @@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim, false); + ret = cachefiles_bury_object(cache, dir, victim, false, + FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 64fa248343f6..8f84646f10e9 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, val_size2 = posix_acl_xattr_size(default_acl->a_count); err = -ENOMEM; - tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); + tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); if (!tmp_buf) goto out_err; - pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); + pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); if (!pagelist) goto out_err; ceph_pagelist_init(pagelist); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e162bcd105ee..890c50971a69 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page) inode = mapping->host; ci = ceph_inode(inode); - /* - * Note that we're grabbing a snapc ref here without holding - * any locks! - */ - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); - /* dirty the head */ spin_lock(&ci->i_ceph_lock); - if (ci->i_head_snapc == NULL) - ci->i_head_snapc = ceph_get_snap_context(snapc); - ++ci->i_wrbuffer_ref_head; + BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + capsnap->dirty_pages++; + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + ++ci->i_wrbuffer_ref_head; + } if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; @@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) /* build page vector */ nr_pages = calc_pages_for(0, len); - pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); ret = -ENOMEM; if (!pages) goto out; @@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) dout("start_read %p adding %p idx %lu\n", inode, page, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_NOFS)) { + GFP_KERNEL)) { ceph_fscache_uncache_page(inode, page); page_cache_release(page); dout("start_read %p add_to_page_cache failed %p\n", @@ -436,7 +440,7 @@ out: * only snap context we are allowed to write back. */ static struct ceph_snap_context *get_oldest_context(struct inode *inode, - u64 *snap_size) + loff_t *snap_size) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); + loff_t snap_size = -1; long writeback_stat; - u64 truncate_size, snap_size = 0; + u64 truncate_size; u32 truncate_seq; int err = 0, len = PAGE_CACHE_SIZE; @@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) + if (snap_size == -1) snap_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); @@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync = 0; - u64 truncate_size, snap_size; + loff_t snap_size, i_size; + u64 truncate_size; u32 truncate_seq; /* @@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); - snap_size = 0; + snap_size = -1; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -749,16 +755,13 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } - if (snap_size == 0) - snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) - snap_size = i_size_read(inode); + i_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); if (last_snapc && snapc != last_snapc) { @@ -828,8 +831,10 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if (page_offset(page) >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_offset(page) >= + (snap_size == -1 ? i_size : snap_size)) { + dout("%p page eof %llu\n", page, + (snap_size == -1 ? i_size : snap_size)); done = 1; unlock_page(page); break; @@ -884,7 +889,8 @@ get_more_pages: } if (do_sync) - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, + CEPH_OSD_OP_STARTSYNC, 0); req->r_callback = writepages_finish; req->r_inode = inode; @@ -944,10 +950,18 @@ get_more_pages: } /* Format the osd request message and submit the write */ - offset = page_offset(pages[0]); - len = min(snap_size - offset, - (u64)locked_pages << PAGE_CACHE_SHIFT); + len = (u64)locked_pages << PAGE_CACHE_SHIFT; + if (snap_size == -1) { + len = min(len, (u64)i_size_read(inode) - offset); + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + len = max(len, 1 + + ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); + } else { + len = min(len, snap_size - offset); + } dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); @@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file, { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -1044,10 +1057,6 @@ retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); - /* check snap context */ - BUG_ON(!ci->i_snap_realm); - down_read(&mdsc->snap_rwsem); - BUG_ON(!ci->i_snap_realm->cached_context); snapc = page_snap_context(page); if (snapc && snapc != ci->i_head_snapc) { /* @@ -1055,7 +1064,6 @@ retry_locked: * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL); - up_read(&mdsc->snap_rwsem); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); @@ -1112,7 +1120,6 @@ retry_locked: } /* we need to read it. */ - up_read(&mdsc->snap_rwsem); r = readpage_nounlock(file, page); if (r < 0) goto fail_nosnap; @@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, /* * we don't do anything in here that simple_write_end doesn't do - * except adjust dirty page accounting and drop read lock on - * mdsc->snap_rwsem. + * except adjust dirty page accounting */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); unlock_page(page); - up_read(&mdsc->snap_rwsem); page_cache_release(page); if (check_cap) @@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_cap_flush *prealloc_cf; struct page *page = vmf->page; loff_t off = page_offset(page); loff_t size = i_size_read(inode); size_t len; int want, got, ret; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return VM_FAULT_SIGBUS; + if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; if (off == 0) { @@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) - return VM_FAULT_SIGBUS; + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_free; + } } if (off + PAGE_CACHE_SIZE <= size) @@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) break; if (ret != -ERESTARTSYS) { WARN_ON(1); - return VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; + goto out_free; } } dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", @@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret == 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); - up_read(&mdsc->snap_rwsem); ret = VM_FAULT_LOCKED; } else { if (ret == -ENOMEM) @@ -1389,7 +1398,8 @@ out: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1398,6 +1408,8 @@ out: dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); +out_free: + ceph_free_cap_flush(prealloc_cf); return ret; } @@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, - 0, 0, false); + ceph_empty_snapc, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, + ceph_empty_snapc, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &ceph_vmops; return 0; } + +enum { + POOL_READ = 1, + POOL_WRITE = 2, +}; + +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; + struct rb_node **p, *parent; + struct ceph_pool_perm *perm; + struct page **pages; + int err = 0, err2 = 0, have = 0; + + down_read(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + while (*p) { + perm = rb_entry(*p, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + up_read(&mdsc->pool_perm_rwsem); + if (*p) + goto out; + + dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); + + down_write(&mdsc->pool_perm_rwsem); + parent = NULL; + while (*p) { + parent = *p; + perm = rb_entry(parent, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + if (*p) { + up_write(&mdsc->pool_perm_rwsem); + goto out; + } + + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!rd_req) { + err = -ENOMEM; + goto out_unlock; + } + + rd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); + rd_req->r_base_oloc.pool = pool; + snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), + "%llx.00000000", ci->i_vino.ino); + rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!wr_req) { + err = -ENOMEM; + goto out_unlock; + } + + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); + wr_req->r_base_oloc.pool = pool; + wr_req->r_base_oid = rd_req->r_base_oid; + + /* one page should be large enough for STAT data */ + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) { + err = PTR_ERR(pages); + goto out_unlock; + } + + osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, + 0, false, true); + ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + + ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + if (!err2) + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + + if (err >= 0 || err == -ENOENT) + have |= POOL_READ; + else if (err != -EPERM) + goto out_unlock; + + if (err2 == 0 || err2 == -EEXIST) + have |= POOL_WRITE; + else if (err2 != -EPERM) { + err = err2; + goto out_unlock; + } + + perm = kmalloc(sizeof(*perm), GFP_NOFS); + if (!perm) { + err = -ENOMEM; + goto out_unlock; + } + + perm->pool = pool; + perm->perm = have; + rb_link_node(&perm->node, parent, p); + rb_insert_color(&perm->node, &mdsc->pool_perm_tree); + err = 0; +out_unlock: + up_write(&mdsc->pool_perm_rwsem); + + if (rd_req) + ceph_osdc_put_request(rd_req); + if (wr_req) + ceph_osdc_put_request(wr_req); +out: + if (!err) + err = have; + dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); + return err; +} + +int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +{ + u32 pool; + int ret, flags; + + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + NOPOOLPERM)) + return 0; + + spin_lock(&ci->i_ceph_lock); + flags = ci->i_ceph_flags; + pool = ceph_file_layout_pg_pool(ci->i_layout); + spin_unlock(&ci->i_ceph_lock); +check: + if (flags & CEPH_I_POOL_PERM) { + if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { + dout("ceph_pool_perm_check pool %u no read perm\n", + pool); + return -EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { + dout("ceph_pool_perm_check pool %u no write perm\n", + pool); + return -EPERM; + } + return 0; + } + + ret = __ceph_pool_perm_get(ci, pool); + if (ret < 0) + return ret; + + flags = CEPH_I_POOL_PERM; + if (ret & POOL_READ) + flags |= CEPH_I_POOL_RD; + if (ret & POOL_WRITE) + flags |= CEPH_I_POOL_WR; + + spin_lock(&ci->i_ceph_lock); + if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { + ci->i_ceph_flags = flags; + } else { + pool = ceph_file_layout_pg_pool(ci->i_layout); + flags = ci->i_ceph_flags; + } + spin_unlock(&ci->i_ceph_lock); + goto check; +} + +void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) +{ + struct ceph_pool_perm *perm; + struct rb_node *n; + + while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { + n = rb_first(&mdsc->pool_perm_tree); + perm = rb_entry(n, struct ceph_pool_perm, node); + rb_erase(n, &mdsc->pool_perm_tree); + kfree(perm); + } +} diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index be5ea6af8366..dc10c9dd36c1 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + if (ci->i_rdcache_ref || + (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ + ci->vfs_inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from session list */ spin_lock(&session->s_cap_lock); - /* - * s_cap_reconnect is protected by s_cap_lock. no one changes - * s_cap_gen while session is in the reconnect state. - */ - if (queue_release && - (!session->s_cap_reconnect || - cap->cap_gen == session->s_cap_gen)) - __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, - cap->mseq, cap->issue_seq); - if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; + + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + cap->queue_release = 1; + if (removed) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + removed = 0; + } + } else { + cap->queue_release = 0; + } + cap->cap_ino = ci->i_vino.ino; + spin_unlock(&session->s_cap_lock); /* remove from inode list */ @@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) static int send_cap_msg(struct ceph_mds_session *session, u64 ino, u64 cid, int op, int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, - u64 size, u64 max_size, + u32 seq, u64 flush_tid, u64 oldest_flush_tid, + u32 issue_seq, u32 mseq, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, kuid_t uid, kgid_t gid, umode_t mode, @@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session, size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u mseq %u follows %lld size %llu/%llu" + " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), ceph_cap_string(dirty), - seq, issue_seq, mseq, follows, size, max_size, + seq, issue_seq, flush_tid, oldest_flush_tid, + mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - /* flock buffer size + inline version + inline data size */ - extra_len = 4 + 8 + 4; + /* flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid */ + extra_len = 4 + 8 + 4 + 4 + 8; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; + msg->hdr.version = cpu_to_le16(6); msg->hdr.tid = cpu_to_le64(flush_tid); fc = msg->front.iov_base; @@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); + /* osd_epoch_barrier */ + ceph_encode_32(&p, 0); + /* oldest_flush_tid */ + ceph_encode_64(&p, oldest_flush_tid); fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { @@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; - - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %llx release to mds%d msg %p (%d left)\n", - ino, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ino); - item->cap_id = cpu_to_le64(cap_id); - item->migrate_seq = cpu_to_le32(migrate_seq); - item->seq = cpu_to_le32(issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); - } -} - /* * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. @@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode) */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, - unsigned *pflush_tid) + u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; @@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; int delayed = 0; - u64 flush_tid = 0; - int i; int ret; bool inline_data; @@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; - if (flushing) { - /* - * assign a tid for flush operations so we can avoid - * flush1 -> dirty1 -> flush2 -> flushack1 -> mark - * clean type races. track latest tid for every bit - * so we can handle flush AxFw, flush Fw, and have the - * first ack clean Ax. - */ - flush_tid = ++ci->i_cap_flush_last_tid; - if (pflush_tid) - *pflush_tid = flush_tid; - dout(" cap_flush_tid %d\n", (int)flush_tid); - for (i = 0; i < CEPH_CAP_BITS; i++) - if (flushing & (1 << i)) - ci->i_cap_flush_tid[i] = flush_tid; - - follows = ci->i_head_snapc->seq; - } else { - follows = 0; - } + follows = flushing ? ci->i_head_snapc->seq : 0; keep = cap->implemented; seq = cap->seq; @@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + op, keep, want, flushing, seq, + flush_tid, oldest_flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, follows, inline_data); @@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @again is true, skip cap_snaps that were already sent to + * Unless @kick is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, - int again) + int kick) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { @@ -1297,11 +1257,8 @@ retry: if (capsnap->dirty_pages || capsnap->writing) break; - /* - * if cap writeback already occurred, we should have dropped - * the capsnap in ceph_put_wrbuffer_cap_refs. - */ - BUG_ON(capsnap->dirty == 0); + /* should be removed by ceph_try_drop_cap_snap() */ + BUG_ON(!capsnap->need_flush); /* pick mds, take s_mutex */ if (ci->i_auth_cap == NULL) { @@ -1310,7 +1267,7 @@ retry: } /* only flush each capsnap once */ - if (!again && !list_empty(&capsnap->flushing_item)) { + if (!kick && !list_empty(&capsnap->flushing_item)) { dout("already flushed %p, skipping\n", capsnap); continue; } @@ -1320,6 +1277,9 @@ retry: if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); + if (kick) + goto out; + mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); session = NULL; @@ -1343,20 +1303,22 @@ retry: goto retry; } - capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + spin_lock(&mdsc->cap_dirty_lock); + capsnap->flush_tid = ++mdsc->last_cap_flush_tid; + spin_unlock(&mdsc->cap_dirty_lock); + atomic_inc(&capsnap->nref); - if (!list_empty(&capsnap->flushing_item)) - list_del_init(&capsnap->flushing_item); - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); + if (list_empty(&capsnap->flushing_item)) + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, - capsnap->size, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, + 0, mseq, capsnap->size, 0, &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, @@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ -int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - if (!ci->i_head_snapc) + WARN_ON_ONCE(ci->i_prealloc_cap_flush); + swap(ci->i_prealloc_cap_flush, *pcf); + + if (!ci->i_head_snapc) { + WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); + } dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); @@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ihold(inode); dirty |= I_DIRTY_SYNC; } + } else { + WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && @@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) return dirty; } +static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &ci->i_cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, i_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->i_node, parent, p); + rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); +} + +static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &mdsc->cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, g_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->g_node, parent, p); + rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); +} + +struct ceph_cap_flush *ceph_alloc_cap_flush(void) +{ + return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); +} + +void ceph_free_cap_flush(struct ceph_cap_flush *cf) +{ + if (cf) + kmem_cache_free(ceph_cap_flush_cachep, cf); +} + +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) +{ + struct rb_node *n = rb_first(&mdsc->cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, g_node); + return cf->tid; + } + return 0; +} + /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. @@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + u64 *flush_tid, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *cf = NULL; int flushing; BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); + BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", @@ -1463,22 +1504,31 @@ static int __mark_caps_flushing(struct inode *inode, ci->i_dirty_caps = 0; dout(" inode %p now !dirty\n", inode); + swap(cf, ci->i_prealloc_cap_flush); + cf->caps = flushing; + cf->kick = false; + spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); + cf->tid = ++mdsc->last_cap_flush_tid; + __add_cap_flushing_to_mdsc(mdsc, cf); + *oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing tid %llu\n", inode, cf->tid); } else { list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing (more) tid %llu\n", + inode, cf->tid); } spin_unlock(&mdsc->cap_dirty_lock); + __add_cap_flushing_to_inode(ci, cf); + + *flush_tid = cf->tid; return flushing; } @@ -1524,6 +1574,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; + u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; @@ -1553,13 +1604,13 @@ retry: retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); - want = file_wanted | used; issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; - retain = want | CEPH_CAP_PIN; + want = file_wanted; + retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { - if (want) { + if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && @@ -1602,9 +1653,10 @@ retry_locked: * If we fail, it's because pages are locked.... try again later. */ if ((!is_delayed || mdsc->stopping) && - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - inode->i_data.nrpages && /* have cached pages */ - (file_wanted == 0 || /* no open files */ + !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (file_wanted == 0 || /* no open files */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ !tried_invalidate) { @@ -1742,17 +1794,25 @@ ack: took_snap_rwsem = 1; } - if (cap == ci->i_auth_cap && ci->i_dirty_caps) - flushing = __mark_caps_flushing(inode, session); - else + if (cap == ci->i_auth_cap && ci->i_dirty_caps) { + flushing = __mark_caps_flushing(inode, session, + &flush_tid, + &oldest_flush_tid); + } else { flushing = 0; + flush_tid = 0; + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + } mds = cap->mds; /* remember mds, so we don't repeat */ sent++; /* __send_cap drops i_ceph_lock */ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, NULL); + want, retain, flushing, + flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -1781,12 +1841,13 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int flushing = 0; struct ceph_mds_session *session = NULL; + int flushing = 0; + u64 flush_tid = 0, oldest_flush_tid = 0; retry: spin_lock(&ci->i_ceph_lock); @@ -1811,42 +1872,54 @@ retry: if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; - flushing = __mark_caps_flushing(inode, session); + flushing = __mark_caps_flushing(inode, session, &flush_tid, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - cap->issued | cap->implemented, flushing, - flush_tid); - if (!delayed) - goto out_unlocked; + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + } else { + struct rb_node *n = rb_last(&ci->i_cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, i_node); + flush_tid = cf->tid; + } + flushing = ci->i_flushing_caps; + spin_unlock(&ci->i_ceph_lock); } out: - spin_unlock(&ci->i_ceph_lock); -out_unlocked: if (session) mutex_unlock(&session->s_mutex); + + *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. */ -static int caps_are_flushed(struct inode *inode, unsigned tid) +static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int i, ret = 1; + struct ceph_cap_flush *cf; + struct rb_node *n; + int ret = 1; spin_lock(&ci->i_ceph_lock); - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((ci->i_flushing_caps & (1 << i)) && - ci->i_cap_flush_tid[i] <= tid) { - /* still flushing this bit */ + n = rb_first(&ci->i_cap_flush_tree); + if (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid <= flush_tid) ret = 0; - break; - } + } spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1864,13 +1937,16 @@ static void sync_write_wait(struct inode *inode) struct ceph_osd_request *req; u64 last_tid; + if (!S_ISREG(inode->i_mode)) + return; + spin_lock(&ci->i_unsafe_lock); if (list_empty(head)) goto out; /* set upper bound as _last_ entry in chain */ - req = list_entry(head->prev, struct ceph_osd_request, - r_unsafe_item); + req = list_last_entry(head, struct ceph_osd_request, + r_unsafe_item); last_tid = req->r_tid; do { @@ -1888,18 +1964,64 @@ static void sync_write_wait(struct inode *inode) */ if (list_empty(head)) break; - req = list_entry(head->next, struct ceph_osd_request, - r_unsafe_item); + req = list_first_entry(head, struct ceph_osd_request, + r_unsafe_item); } while (req->r_tid < last_tid); out: spin_unlock(&ci->i_unsafe_lock); } +/* + * wait for any uncommitted directory operations to commit. + */ +static int unsafe_dirop_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + if (!S_ISDIR(inode->i_mode)) + return 0; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_last_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + + dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + ret = !wait_for_completion_timeout(&req->r_safe_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (ret) + ret = -EIO; /* timed out */ + + ceph_mdsc_put_request(req); + + spin_lock(&ci->i_unsafe_lock); + if (ret || list_empty(head)) + break; + req = list_first_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); + return ret; +} + int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int ret; int dirty; @@ -1908,25 +2030,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) - return ret; + goto out; + + if (datasync) + goto out; + mutex_lock(&inode->i_mutex); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + ret = unsafe_dirop_wait(inode); + /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - dout("fsync waiting for flush_tid %u\n", flush_tid); + if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { ret = wait_event_interruptible(ci->i_cap_wq, - caps_are_flushed(inode, flush_tid)); + caps_are_flushed(inode, flush_tid)); } - - dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); mutex_unlock(&inode->i_mutex); +out: + dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; } @@ -1939,7 +2066,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int err = 0; int dirty; int wait = wbc->sync_mode == WB_SYNC_ALL; @@ -1994,6 +2121,104 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } } +static int __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + bool kick_all) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + struct rb_node *n; + int delayed = 0; + u64 first_tid = 0; + u64 oldest_flush_tid; + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + while (true) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + break; + } + + for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid < first_tid) + continue; + if (kick_all || cf->kick) + break; + } + if (!n) { + spin_unlock(&ci->i_ceph_lock); + break; + } + + cf = rb_entry(n, struct ceph_cap_flush, i_node); + cf->kick = false; + + first_tid = cf->tid + 1; + + dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, + cap, cf->tid, ceph_cap_string(cf->caps)); + delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + cf->caps, cf->tid, oldest_flush_tid); + } + return delayed; +} + +void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + struct rb_node *n; + + dout("early_kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + + + /* + * if flushing caps were revoked, we re-send the cap flush + * in client reconnect stage. This guarantees MDS * processes + * the cap flush message before issuing the flushing caps to + * other client. + */ + if ((cap->issued & ci->i_flushing_caps) != + ci->i_flushing_caps) { + spin_unlock(&ci->i_ceph_lock); + if (!__kick_flushing_caps(mdsc, session, ci, true)) + continue; + spin_lock(&ci->i_ceph_lock); + } + + for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + cf->kick = true; + } + + spin_unlock(&ci->i_ceph_lock); + } +} + void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { @@ -2003,28 +2228,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; - int delayed = 0; - - spin_lock(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p %s\n", inode, - cap, ceph_cap_string(ci->i_flushing_caps)); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); - spin_unlock(&ci->i_ceph_lock); - } - } else { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); + int delayed = __kick_flushing_caps(mdsc, session, ci, false); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } } @@ -2036,26 +2243,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; - int delayed = 0; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; - dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, - ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + dout("kick_flushing_inode_caps %p flushing %s\n", inode, + ceph_cap_string(ci->i_flushing_caps)); __ceph_flush_snaps(ci, &session, 1); if (ci->i_flushing_caps) { + int delayed; + spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); + spin_unlock(&ci->i_ceph_lock); + + delayed = __kick_flushing_caps(mdsc, session, ci, true); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); @@ -2073,7 +2279,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, * * Protected by i_ceph_lock. */ -static void __take_cap_refs(struct ceph_inode_info *ci, int got) +static void __take_cap_refs(struct ceph_inode_info *ci, int got, + bool snap_rwsem_locked) { if (got & CEPH_CAP_PIN) ci->i_pin_ref++; @@ -2081,8 +2288,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; - if (got & CEPH_CAP_FILE_WR) + if (got & CEPH_CAP_FILE_WR) { + if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { + BUG_ON(!snap_rwsem_locked); + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } ci->i_wr_ref++; + } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); @@ -2100,16 +2313,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, int *got, int *check_max, int *err) + loff_t endoff, bool nonblock, int *got, int *err) { struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; int file_wanted; + bool snap_rwsem_locked = false; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); +again: spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2125,6 +2341,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) { + up_read(&mdsc->snap_rwsem); + snap_rwsem_locked = false; + } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } @@ -2136,7 +2356,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) { - *check_max = 1; + *err = -EAGAIN; ret = 1; } goto out_unlock; @@ -2164,8 +2384,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { + if (!snap_rwsem_locked && + !ci->i_head_snapc && + (need & CEPH_CAP_FILE_WR)) { + if (!down_read_trylock(&mdsc->snap_rwsem)) { + /* + * we can not call down_read() when + * task isn't in TASK_RUNNING state + */ + if (nonblock) { + *err = -EAGAIN; + ret = 1; + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + snap_rwsem_locked = true; + goto again; + } + snap_rwsem_locked = true; + } *got = need | (have & want); - __take_cap_refs(ci, *got); + __take_cap_refs(ci, *got, true); ret = 1; } } else { @@ -2189,6 +2430,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } out_unlock: spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) + up_read(&mdsc->snap_rwsem); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); @@ -2231,50 +2474,70 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, check_max, ret, err = 0; + int _got, ret, err = 0; -retry: - if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); - _got = 0; - check_max = 0; - ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - &_got, &check_max, &err)); - if (err) - ret = err; + ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - if (check_max) - goto retry; + while (true) { + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); - if (ci->i_inline_version != CEPH_INLINE_NONE && - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { - struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - goto out; - } - page_cache_release(page); - } - /* - * drop cap refs first because getattr while holding - * caps refs can cause deadlock. - */ - ceph_put_cap_refs(ci, _got); + err = 0; _got = 0; + ret = try_get_cap_refs(ci, need, want, endoff, + false, &_got, &err); + if (ret) { + if (err == -EAGAIN) + continue; + if (err < 0) + return err; + } else { + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, endoff, + true, &_got, &err)); + if (err == -EAGAIN) + continue; + if (err < 0) + ret = err; + if (ret < 0) + return ret; + } - /* getattr request will bring inline data into page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, - CEPH_STAT_CAP_INLINE_DATA, true); - if (ret < 0) - return ret; - goto retry; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = + find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + break; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while + * holding * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* + * getattr request will bring inline data into + * page cache + */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, + true); + if (ret < 0) + return ret; + continue; + } + break; } -out: + *got = _got; return 0; } @@ -2286,10 +2549,31 @@ out: void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); - __take_cap_refs(ci, caps); + __take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } + +/* + * drop cap_snap that is not associated with any snapshot. + * we don't need to send FLUSHSNAP message for it. + */ +static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (!capsnap->need_flush && + !capsnap->writing && !capsnap->dirty_pages) { + + dout("dropping cap_snap %p follows %llu\n", + capsnap, capsnap->follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + return 1; + } + return 0; +} + /* * Release cap refs. * @@ -2303,7 +2587,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; - struct ceph_cap_snap *capsnap; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) @@ -2325,17 +2608,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { last++; - if (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - if (capsnap->writing) { - capsnap->writing = 0; - flushsnaps = - __ceph_finish_cap_snap(ci, - capsnap); - wake = 1; - } + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(capsnap)) + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) @@ -2352,7 +2642,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ceph_flush_snaps(ci); if (wake) wake_up_all(&ci->i_cap_wq); - if (put) + while (put-- > 0) iput(inode); } @@ -2380,7 +2670,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && - ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2401,25 +2693,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = 1; - if (capsnap->dirty == 0) - /* cap writeback completed before we created - * the cap_snap; no FLUSHSNAP is needed */ - drop_capsnap = 1; + drop_capsnap = ceph_try_drop_cap_snap(capsnap); } dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s%s\n", + " snap %lld %d/%d -> %d/%d %s%s\n", inode, capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", - complete_capsnap ? " (complete capsnap)" : "", - drop_capsnap ? " (drop capsnap)" : ""); - if (drop_capsnap) { - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - } + complete_capsnap ? " (complete capsnap)" : ""); } spin_unlock(&ci->i_ceph_lock); @@ -2526,7 +2808,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode)) { @@ -2732,16 +3015,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *cf; + struct rb_node *n; + LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; int drop = 0; - int i; - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((dirty & (1 << i)) && - (u16)flush_tid == ci->i_cap_flush_tid[i]) - cleaned |= 1 << i; + n = rb_first(&ci->i_cap_flush_tree); + while (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + n = rb_next(&cf->i_node); + if (cf->tid == flush_tid) + cleaned = cf->caps; + if (cf->tid <= flush_tid) { + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add_tail(&cf->list, &to_remove); + } else { + cleaned &= ~cf->caps; + if (!cleaned) + break; + } + } dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," " flushing %s -> %s\n", @@ -2749,12 +3045,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); - if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); + + if (!list_empty(&to_remove)) { + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (!cf || cf->tid > flush_tid) + wake_up_all(&mdsc->cap_flushing_wq); + } + if (ci->i_flushing_caps == 0) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) @@ -2764,14 +3071,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; - if (ci->i_wrbuffer_ref_head == 0) { + if (ci->i_wr_ref == 0 && + ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2785,6 +3092,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, out: spin_unlock(&ci->i_ceph_lock); + + while (!list_empty(&to_remove)) { + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } if (drop) iput(inode); } @@ -2800,6 +3114,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap; int drop = 0; @@ -2823,6 +3138,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, list_del(&capsnap->ci_item); list_del(&capsnap->flushing_item); ceph_put_cap_snap(capsnap); + wake_up_all(&mdsc->cap_flushing_wq); drop = 1; break; } else { @@ -2971,7 +3287,6 @@ retry: mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } - ceph_add_cap_releases(mdsc, tsession); new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); @@ -3167,16 +3482,20 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); - if (op == CEPH_CAP_OP_IMPORT) - ceph_add_cap_releases(mdsc, session); - if (!inode) { dout(" i don't have ino %llx\n", vino.ino); if (op == CEPH_CAP_OP_IMPORT) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = cap_id; + cap->mseq = mseq; + cap->seq = seq; spin_lock(&session->s_cap_lock); - __queue_cap_release(session, vino.ino, cap_id, - mseq, seq); + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; spin_unlock(&session->s_cap_lock); } goto flush_cap_releases; @@ -3252,11 +3571,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, flush_cap_releases: /* - * send any full release message to try to move things + * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); done: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4248307fea90..9314b4ea2375 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r) } /* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. + */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len, unsigned next_offset) +{ + char *buf = kmalloc(len+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + kfree(fi->last_name); + fi->last_name = buf; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + fi->next_offset = next_offset; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +/* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on * d_child when we initially get results back from the MDS, and @@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_path.dentry; struct inode *dir = d_inode(parent); - struct list_head *p; - struct dentry *dentry, *last; + struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *); int err = 0; + loff_t ptr_pos = 0; + struct ceph_readdir_cache_control cache_ctl = {}; - /* claim ref on last dentry we returned */ - last = fi->dentry; - fi->dentry = NULL; - - dout("__dcache_readdir %p v%u at %llu (last %p)\n", - dir, shared_gen, ctx->pos, last); + dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); - spin_lock(&parent->d_lock); - - /* start at beginning? */ - if (ctx->pos == 2 || last == NULL || - fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { - if (list_empty(&parent->d_subdirs)) - goto out_unlock; - p = parent->d_subdirs.prev; - dout(" initial p %p/%p\n", p->prev, p->next); - } else { - p = last->d_child.prev; + /* we can calculate cache index for the first dirfrag */ + if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { + cache_ctl.index = fpos_off(ctx->pos) - 2; + BUG_ON(cache_ctl.index < 0); + ptr_pos = cache_ctl.index * sizeof(struct dentry *); } -more: - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - while (1) { - dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, - d_unhashed(dentry) ? "!hashed" : "hashed", - parent->d_subdirs.prev, parent->d_subdirs.next); - if (p == &parent->d_subdirs) { + while (true) { + pgoff_t pgoff; + bool emit_dentry; + + if (ptr_pos >= i_size_read(dir)) { fi->flags |= CEPH_F_ATEND; - goto out_unlock; + err = 0; + break; + } + + err = -EAGAIN; + pgoff = ptr_pos >> PAGE_CACHE_SHIFT; + if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { + ceph_readdir_cache_release(&cache_ctl); + cache_ctl.page = find_lock_page(&dir->i_data, pgoff); + if (!cache_ctl.page) { + dout(" page %lu not found\n", pgoff); + break; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(cache_ctl.page); + cache_ctl.dentries = kmap(cache_ctl.page); } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && + ptr_pos < i_size_read(dir)) + dentry = cache_ctl.dentries[cache_ctl.index % nsize]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + if (!dentry) + break; + + emit_dentry = false; + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && - !d_unhashed(dentry) && d_really_is_positive(dentry) && + d_really_is_positive(dentry) && ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && - fpos_cmp(ctx->pos, di->offset) <= 0) - break; - dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, - dentry, di->offset, - ctx->pos, d_unhashed(dentry) ? " unhashed" : "", - !d_inode(dentry) ? " null" : ""); + fpos_cmp(ctx->pos, di->offset) <= 0) { + emit_dentry = true; + } spin_unlock(&dentry->d_lock); - p = p->prev; - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - } - - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete_ordered(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - dput(dentry); - err = -EAGAIN; - goto out; - } + if (emit_dentry) { + dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dentry, dentry, d_inode(dentry)); + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, + ceph_translate_ino(dentry->d_sb, + d_inode(dentry)->i_ino), + d_inode(dentry)->i_mode >> 12)) { + dput(dentry); + err = 0; + break; + } + ctx->pos++; - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, - dentry, dentry, d_inode(dentry)); - if (!dir_emit(ctx, dentry->d_name.name, - dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), - d_inode(dentry)->i_mode >> 12)) { - if (last) { - /* remember our position */ - fi->dentry = last; - fi->next_offset = fpos_off(di->offset); + if (last) + dput(last); + last = dentry; + } else { + dput(dentry); } - dput(dentry); - return 0; - } - - ctx->pos = di->offset + 1; - if (last) - dput(last); - last = dentry; - - spin_lock(&parent->d_lock); - p = p->prev; /* advance to next dentry */ - goto more; - -out_unlock: - spin_unlock(&parent->d_lock); -out: - if (last) + cache_ctl.index++; + ptr_pos += sizeof(struct dentry *); + } + ceph_readdir_cache_release(&cache_ctl); + if (last) { + int ret; + di = ceph_dentry(last); + ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, + fpos_off(di->offset) + 1); + if (ret < 0) + err = ret; dput(last); + } return err; } -/* - * make note of the last dentry we read, so we can - * continue at the same lexicographical point, - * regardless of what dir changes take place on the - * server. - */ -static int note_last_dentry(struct ceph_file_info *fi, const char *name, - int len) -{ - kfree(fi->last_name); - fi->last_name = kmalloc(len+1, GFP_NOFS); - if (!fi->last_name) - return -ENOMEM; - memcpy(fi->last_name, name, len); - fi->last_name[len] = 0; - dout("note_last_dentry '%s'\n", fi->last_name); - return 0; -} - static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); - if ((ctx->pos == 2 || fi->dentry) && - ceph_test_mount_opt(fsc, DCACHE) && + if (ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && @@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) } else { spin_unlock(&ci->i_ceph_lock); } - if (fi->dentry) { - err = note_last_dentry(fi, fi->dentry->d_name.name, - fi->dentry->d_name.len); - if (err) - return err; - dput(fi->dentry); - fi->dentry = NULL; - } /* proceed with a normal readdir */ - - if (ctx->pos == 2) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - fi->dir_ordered_count = ci->i_ordered_count; - } - more: /* do we have the correct frag content buffered? */ if (fi->frag != frag || fi->last_readdir == NULL) { @@ -342,12 +336,15 @@ more: req->r_direct_hash = ceph_frag_value(frag); req->r_direct_is_hash = true; if (fi->last_name) { - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; } } + req->r_dir_release_cnt = fi->dir_release_count; + req->r_dir_ordered_cnt = fi->dir_ordered_count; + req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); @@ -364,26 +361,38 @@ more: (int)req->r_reply_info.dir_end, (int)req->r_reply_info.dir_complete); - if (!req->r_did_prepopulate) { - dout("readdir !did_prepopulate"); - /* preclude from marking dir complete */ - fi->dir_release_count--; - } /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - off = fi->next_offset; + off = req->r_readdir_offset; + fi->next_offset = off; } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; + if (req->r_did_prepopulate) { + fi->readdir_cache_idx = req->r_readdir_cache_idx; + if (fi->readdir_cache_idx < 0) { + /* preclude from marking dir ordered */ + fi->dir_ordered_count = 0; + } else if (ceph_frag_is_leftmost(frag) && off == 2) { + /* note dir version at start of readdir so + * we can tell if any dentries get dropped */ + fi->dir_release_count = req->r_dir_release_cnt; + fi->dir_ordered_count = req->r_dir_ordered_cnt; + } + } else { + dout("readdir !did_prepopulate"); + /* disable readdir cache */ + fi->readdir_cache_idx = -1; + /* preclude from marking dir complete */ + fi->dir_release_count = 0; + } + if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; @@ -394,10 +403,10 @@ more: } else { err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1]); + rinfo->dir_dname_len[rinfo->dir_nr-1], + fi->next_offset + rinfo->dir_nr); if (err) return err; - fi->next_offset += rinfo->dir_nr; } } @@ -453,16 +462,22 @@ more: * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&ci->i_ceph_lock); - if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - if (ci->i_ordered_count == fi->dir_ordered_count) + if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { + spin_lock(&ci->i_ceph_lock); + if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { dout(" marking %p complete and ordered\n", inode); - else + /* use i_size to track number of entries in + * readdir cache */ + BUG_ON(fi->readdir_cache_idx < 0); + i_size_write(inode, fi->readdir_cache_idx * + sizeof(struct dentry*)); + } else { dout(" marking %p complete\n", inode); + } __ceph_dir_set_complete(ci, fi->dir_release_count, fi->dir_ordered_count); + spin_unlock(&ci->i_ceph_lock); } - spin_unlock(&ci->i_ceph_lock); dout("readdir %p file %p done.\n", inode, file); return 0; @@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) } kfree(fi->last_name); fi->last_name = NULL; + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; if (ceph_frag_is_leftmost(frag)) fi->next_offset = 2; /* compensate for . and .. */ else fi->next_offset = 0; - if (fi->dentry) { - dput(fi->dentry); - fi->dentry = NULL; - } fi->flags &= ~CEPH_F_ATEND; } @@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) mutex_lock(&inode->i_mutex); retval = -EINVAL; switch (whence) { - case SEEK_END: - offset += inode->i_size + 2; /* FIXME */ - break; case SEEK_CUR: offset += file->f_pos; case SEEK_SET: break; + case SEEK_END: + retval = -EOPNOTSUPP; default: goto out; } @@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } retval = offset; - /* - * discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk. - */ if (offset == 0 || fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { + /* discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk */ dout("dir_llseek dropping %p content\n", file); reset_readdir(fi, fpos_frag(offset)); + } else if (fpos_cmp(offset, old_offset) > 0) { + /* reset dir_release_count if we did a forward seek */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; } - - /* bump dir_release_count if we did a forward seek */ - if (fpos_cmp(offset, old_offset) > 0) - fi->dir_release_count--; } out: mutex_unlock(&inode->i_mutex); @@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, err = PTR_ERR(req); goto out; } - req->r_path2 = kstrdup(dest, GFP_NOFS); + req->r_path2 = kstrdup(dest, GFP_KERNEL); if (!req->r_path2) { err = -ENOMEM; ceph_mdsc_put_request(req); @@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); - - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(old_dir); - ceph_dir_clear_complete(new_dir); - } ceph_mdsc_put_request(req); return err; @@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(bufsize, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_KERNEL); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = @@ -1224,66 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, } /* - * an fsync() on a dir will wait for any uncommitted directory - * operations to commit. - */ -static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_dirops; - struct ceph_mds_request *req; - u64 last_tid; - int ret = 0; - - dout("dir_fsync %p\n", inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - req = list_entry(head->prev, - struct ceph_mds_request, r_unsafe_dir_item); - last_tid = req->r_tid; - - do { - ceph_mdsc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("dir_fsync %p wait on tid %llu (until %llu)\n", - inode, req->r_tid, last_tid); - if (req->r_timeout) { - unsigned long time_left = wait_for_completion_timeout( - &req->r_safe_completion, - req->r_timeout); - if (time_left > 0) - ret = 0; - else - ret = -EIO; /* timed out */ - } else { - wait_for_completion(&req->r_safe_completion); - } - ceph_mdsc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - if (ret || list_empty(head)) - break; - req = list_entry(head->next, - struct ceph_mds_request, r_unsafe_dir_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* * We maintain a private dentry LRU. * * FIXME: this needs to be changed to a per-mds lru to be useful. @@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = { .open = ceph_open, .release = ceph_release, .unlocked_ioctl = ceph_ioctl, - .fsync = ceph_dir_fsync, + .fsync = ceph_fsync, }; const struct file_operations ceph_snapdir_fops = { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b6b522b4b31..8b79d87eaf46 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); - cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); if (cf == NULL) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } cf->fmode = fmode; cf->next_offset = 2; + cf->readdir_cache_idx = -1; file->private_data = cf; BUG_ON(inode->i_fop->release != ceph_release); break; @@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) ceph_mdsc_put_request(cf->last_readdir); kfree(cf->last_name); kfree(cf->dir_info); - dput(cf->dentry); kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ @@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, } } else { num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); ret = striped_read(inode, off, len, pages, @@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t start; ssize_t n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, @@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); n = iov_iter_get_pages_alloc(from, &pages, len, &start); if (unlikely(n < 0)) { @@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t left; int n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, 1, @@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -860,7 +858,7 @@ again: struct page *page = NULL; loff_t i_size; if (retry_op == READ_INLINE) { - page = __page_cache_alloc(GFP_NOFS); + page = __page_cache_alloc(GFP_KERNEL); if (!page) return -ENOMEM; } @@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; int err, want, got; loff_t pos; @@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -959,7 +962,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; count = iov_iter_count(from); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -996,14 +999,30 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct ceph_snap_context *snapc; struct iov_iter data; mutex_unlock(&inode->i_mutex); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos); + written = ceph_sync_direct_write(iocb, &data, pos, + snapc); else - written = ceph_sync_write(iocb, &data, pos); + written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1014,6 +1033,7 @@ retry_snap: } if (written > 0) iov_iter_advance(from, written); + ceph_put_snap_context(snapc); } else { loff_t old_size = inode->i_size; /* @@ -1035,7 +1055,8 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1059,6 +1080,7 @@ retry_snap: out: mutex_unlock(&inode->i_mutex); out_unlocked: + ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; return written ? written : err; } @@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; + struct ceph_cap_flush *prealloc_cf; int want, got = 0; int dirty; int ret = 0; @@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: mutex_unlock(&inode->i_mutex); + ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 571acd88606c..96d2bd829902 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_ordered_count = 0; - atomic_set(&ci->i_release_count, 1); - atomic_set(&ci->i_complete_count, 0); + atomic64_set(&ci->i_ordered_count, 1); + atomic64_set(&ci->i_release_count, 1); + atomic64_set(&ci->i_complete_seq[0], 0); + atomic64_set(&ci->i_complete_seq[1], 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_flushing_caps = 0; INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_flushing_item); - ci->i_cap_flush_seq = 0; - ci->i_cap_flush_last_tid = 0; - memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + ci->i_prealloc_cap_flush = NULL; + ci->i_cap_flush_tree = RB_ROOT; init_waitqueue_head(&ci->i_cap_wq); ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; @@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), @@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); + i_size_write(inode, 0); __ceph_dir_set_complete(ci, - atomic_read(&ci->i_release_count), - ci->i_ordered_count); + atomic64_read(&ci->i_release_count), + atomic64_read(&ci->i_ordered_count)); } wake = true; @@ -1212,6 +1216,10 @@ retry_lookup: dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); + d_move(req->r_old_dentry, dn); dout(" src %p '%pd' dst %p '%pd'\n", req->r_old_dentry, @@ -1222,10 +1230,6 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_ordered(dir); - ceph_dir_clear_ordered(olddir); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, return err; } +void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) +{ + if (ctl->page) { + kunmap(ctl->page); + page_cache_release(ctl->page); + ctl->page = NULL; + } +} + +static int fill_readdir_cache(struct inode *dir, struct dentry *dn, + struct ceph_readdir_cache_control *ctl, + struct ceph_mds_request *req) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*); + unsigned idx = ctl->index % nsize; + pgoff_t pgoff = ctl->index / nsize; + + if (!ctl->page || pgoff != page_index(ctl->page)) { + ceph_readdir_cache_release(ctl); + ctl->page = grab_cache_page(&dir->i_data, pgoff); + if (!ctl->page) { + ctl->index = -1; + return -ENOMEM; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(ctl->page); + ctl->dentries = kmap(ctl->page); + } + + if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && + req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { + dout("readdir cache dn %p idx %d\n", dn, ctl->index); + ctl->dentries[idx] = dn; + ctl->index++; + } else { + dout("disable readdir cache\n"); + ctl->index = -1; + } + return 0; +} + int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { @@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; - u64 r_readdir_offset = req->r_readdir_offset; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + struct ceph_readdir_cache_control cache_ctl = {}; + + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { @@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); if (ceph_frag_is_leftmost(frag)) - r_readdir_offset = 2; + req->r_readdir_offset = 2; else - r_readdir_offset = 0; + req->r_readdir_offset = 0; } - if (req->r_aborted) - return readdir_prepopulate_inodes_only(req, session); - if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { snapdir = ceph_get_snapdir(d_inode(parent)); parent = d_find_alias(snapdir); @@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } + + cache_ctl.index = req->r_readdir_cache_idx; + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1413,13 +1471,6 @@ retry_lookup: d_delete(dn); dput(dn); goto retry_lookup; - } else { - /* reorder parent's d_subdirs */ - spin_lock(&parent->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_child, &parent->d_subdirs); - spin_unlock(&dn->d_lock); - spin_unlock(&parent->d_lock); } /* inode */ @@ -1436,13 +1487,15 @@ retry_lookup: } } - if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation) < 0) { + ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (ret < 0) { pr_err("fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) iput(in); d_drop(dn); + err = ret; goto next_item; } @@ -1458,19 +1511,28 @@ retry_lookup: } di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); update_dentry_lease(dn, rinfo->dir_dlease[i], req->r_session, req->r_request_started); + + if (err == 0 && cache_ctl.index >= 0) { + ret = fill_readdir_cache(d_inode(parent), dn, + &cache_ctl, req); + if (ret < 0) + err = ret; + } next_item: if (dn) dput(dn); } - if (err == 0) - req->r_did_prepopulate = true; - out: + if (err == 0) { + req->r_did_prepopulate = true; + req->r_readdir_cache_idx = cache_ctl.index; + } + ceph_readdir_cache_release(&cache_ctl); if (snapdir) { iput(snapdir); dput(parent); @@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf; int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; + bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, USE_AUTH_MDS); - if (IS_ERR(req)) + if (IS_ERR(req)) { + ceph_free_cap_flush(prealloc_cf); return PTR_ERR(req); + } spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); + + if (!ci->i_head_snapc && + (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + } + } + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ia_valid & ATTR_UID) { @@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dout("setattr %p ATTR_FILE ... hrm!\n", inode); if (dirtied) { - inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, + &prealloc_cf); inode->i_ctime = CURRENT_TIME; } release &= issued; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_mdsc_put_request(req); if (mask & CEPH_SETATTR_SIZE) __ceph_do_pending_vmtruncate(inode); + ceph_free_cap_flush(prealloc_cf); return err; out_put: ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 84f37f34f9aa..6aa07af67603 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include <linux/ratelimit.h> #include "super.h" #include "mds_client.h" @@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); - INIT_LIST_HEAD(&s->s_cap_releases_done); INIT_LIST_HEAD(&s->s_cap_flushing); INIT_LIST_HEAD(&s->s_cap_snaps_flushing); @@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); + if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) + mdsc->oldest_tid = req->r_tid; + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + + if (req->r_tid == mdsc->oldest_tid) { + struct rb_node *p = rb_next(&req->r_node); + mdsc->oldest_tid = 0; + while (p) { + struct ceph_mds_request *next_req = + rb_entry(p, struct ceph_mds_request, r_node); + if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { + mdsc->oldest_tid = next_req->r_tid; + break; + } + p = rb_next(p); + } + } + rb_erase(&req->r_node, &mdsc->request_tree); RB_CLEAR_NODE(&req->r_node); @@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* - * Free preallocated cap messages assigned to this session - */ -static void cleanup_cap_releases(struct ceph_mds_session *session) +/* caller holds s_cap_lock, we drop it */ +static void cleanup_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) + __releases(session->s_cap_lock) { - struct ceph_msg *msg; + LIST_HEAD(tmp_list); + list_splice_init(&session->s_cap_releases, &tmp_list); + session->s_num_cap_releases = 0; + spin_unlock(&session->s_cap_lock); - spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); - } - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); + dout("cleanup_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + struct ceph_cap *cap; + /* zero out the in-progress message */ + cap = list_first_entry(&tmp_list, + struct ceph_cap, session_caps); + list_del(&cap->session_caps); + ceph_put_cap(mdsc, cap); } - spin_unlock(&session->s_cap_lock); } static void cleanup_session_requests(struct ceph_mds_client *mdsc, @@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); list_del_init(&req->r_unsafe_item); - pr_info(" dropping unsafe request %llu\n", req->r_tid); + pr_warn_ratelimited(" dropping unsafe request %llu\n", + req->r_tid); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session, dout("iterate_session_caps finishing cap %p removal\n", cap); BUG_ON(cap->session != session); + cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - cap->session = NULL; - old_cap = cap; /* put_cap it w/o locks held */ + if (cap->queue_release) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + } else { + old_cap = cap; /* put_cap it w/o locks held */ + } } if (ret < 0) goto out; @@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + LIST_HEAD(to_remove); int drop = 0; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { + struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + while (true) { + struct rb_node *n = rb_first(&ci->i_cap_flush_tree); + if (!n) + break; + cf = rb_entry(n, struct ceph_cap_flush, i_node); + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add(&cf->list, &to_remove); + } + spin_lock(&mdsc->cap_dirty_lock); + + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + if (!list_empty(&ci->i_dirty_item)) { - pr_info(" dropping dirty %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty %s state for %p %lld\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_ino(inode)); ci->i_dirty_caps = 0; @@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, drop = 1; } if (!list_empty(&ci->i_flushing_item)) { - pr_info(" dropping dirty+flushing %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty+flushing %s state for %p %lld\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_ino(inode)); ci->i_flushing_caps = 0; @@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, drop = 1; } spin_unlock(&mdsc->cap_dirty_lock); + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + list_add(&ci->i_prealloc_cap_flush->list, &to_remove); + ci->i_prealloc_cap_flush = NULL; + } } spin_unlock(&ci->i_ceph_lock); + while (!list_empty(&to_remove)) { + struct ceph_cap_flush *cf; + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } while (drop--) iput(inode); return 0; @@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session) spin_lock(&session->s_cap_lock); } } - spin_unlock(&session->s_cap_lock); + + // drop cap expires and unlock s_cap_lock + cleanup_cap_releases(session->s_mdsc, session); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); - cleanup_cap_releases(session); } /* @@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), ceph_cap_string(used), ceph_cap_string(wanted)); if (cap == ci->i_auth_cap) { - if (ci->i_dirty_caps | ci->i_flushing_caps) + if (ci->i_dirty_caps || ci->i_flushing_caps || + !list_empty(&ci->i_cap_snaps)) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; @@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc, session->s_trim_caps = 0; } - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); return 0; } -/* - * Allocate cap_release messages. If there is a partially full message - * in the queue, try to allocate enough to cover it's remainder, so that - * we can send it immediately. - * - * Called under s_mutex. - */ -int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static int check_capsnap_flush(struct ceph_inode_info *ci, + u64 want_snap_seq) { - struct ceph_msg *msg, *partial = NULL; - struct ceph_mds_cap_release *head; - int err = -ENOMEM; - int extra = mdsc->fsc->mount_options->cap_release_safety; - int num; - - dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, - extra); - - spin_lock(&session->s_cap_lock); - - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - if (num) { - dout(" partial %p with (%d/%d)\n", msg, num, - (int)CEPH_CAPS_PER_RELEASE); - extra += CEPH_CAPS_PER_RELEASE - num; - partial = msg; - } - } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { - spin_unlock(&session->s_cap_lock); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS, false); - if (!msg) - goto out_unlocked; - dout("add_cap_releases %p msg %p now %d\n", session, msg, - (int)msg->front.iov_len); - head = msg->front.iov_base; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - spin_lock(&session->s_cap_lock); - list_add(&msg->list_head, &session->s_cap_releases); - session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; - } - - if (partial) { - head = partial->front.iov_base; - num = le32_to_cpu(head->num); - dout(" queueing partial %p with %d/%d\n", partial, num, - (int)CEPH_CAPS_PER_RELEASE); - list_move_tail(&partial->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + int ret = 1; + spin_lock(&ci->i_ceph_lock); + if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + ret = capsnap->follows >= want_snap_seq; } - err = 0; - spin_unlock(&session->s_cap_lock); -out_unlocked: - return err; + spin_unlock(&ci->i_ceph_lock); + return ret; } -static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +static int check_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) { - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - spin_lock(&ci->i_ceph_lock); - if (ci->i_flushing_caps) - ret = ci->i_cap_flush_seq >= want_flush_seq; - else - ret = 1; - spin_unlock(&ci->i_ceph_lock); + struct rb_node *n; + struct ceph_cap_flush *cf; + int ret = 1; + + spin_lock(&mdsc->cap_dirty_lock); + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (cf && cf->tid <= want_flush_tid) { + dout("check_caps_flush still flushing tid %llu <= %llu\n", + cf->tid, want_flush_tid); + ret = 0; + } + spin_unlock(&mdsc->cap_dirty_lock); return ret; } /* * flush all dirty inode data to disk. * - * returns true if we've flushed through want_flush_seq + * returns true if we've flushed through want_flush_tid */ -static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid, u64 want_snap_seq) { int mds; - dout("check_cap_flush want %lld\n", want_flush_seq); + dout("check_caps_flush want %llu snap want %llu\n", + want_flush_tid, want_snap_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; ) { struct ceph_mds_session *session = mdsc->sessions[mds]; struct inode *inode = NULL; - if (!session) + if (!session) { + mds++; continue; + } get_session(session); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_flushing)) { - struct ceph_inode_info *ci = - list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item); - - if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { - dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", - &ci->vfs_inode, ci->i_cap_flush_seq, - want_flush_seq, session->s_mds); + if (!list_empty(&session->s_cap_snaps_flushing)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&session->s_cap_snaps_flushing, + struct ceph_cap_snap, + flushing_item); + struct ceph_inode_info *ci = capsnap->ci; + if (!check_capsnap_flush(ci, want_snap_seq)) { + dout("check_cap_flush still flushing snap %p " + "follows %lld <= %lld to mds%d\n", + &ci->vfs_inode, capsnap->follows, + want_snap_seq, mds); inode = igrab(&ci->vfs_inode); } } @@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) if (inode) { wait_event(mdsc->cap_flushing_wq, - check_cap_flush(inode, want_flush_seq)); + check_capsnap_flush(ceph_inode(inode), + want_snap_seq)); iput(inode); + } else { + mds++; } mutex_lock(&mdsc->mutex); } - mutex_unlock(&mdsc->mutex); - dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + + wait_event(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid)); + + dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); } /* @@ -1557,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { - struct ceph_msg *msg; + struct ceph_msg *msg = NULL; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + struct ceph_cap *cap; + LIST_HEAD(tmp_list); + int num_cap_releases; - dout("send_cap_releases mds%d\n", session->s_mds); spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - spin_unlock(&session->s_cap_lock); - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("send_cap_releases mds%d %p\n", session->s_mds, msg); - ceph_con_send(&session->s_con, msg); - spin_lock(&session->s_cap_lock); - } +again: + list_splice_init(&session->s_cap_releases, &tmp_list); + num_cap_releases = session->s_num_cap_releases; + session->s_num_cap_releases = 0; spin_unlock(&session->s_cap_lock); -} -static void discard_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - unsigned num; - - dout("discard_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + if (!msg) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, + PAGE_CACHE_SIZE, GFP_NOFS, false); + if (!msg) + goto out_err; + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + } + cap = list_first_entry(&tmp_list, struct ceph_cap, + session_caps); + list_del(&cap->session_caps); + num_cap_releases--; - if (!list_empty(&session->s_cap_releases)) { - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", - session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; + le32_add_cpu(&head->num, 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(cap->cap_ino); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->seq = cpu_to_le32(cap->issue_seq); + msg->front.iov_len += sizeof(*item); + + ceph_put_cap(mdsc, cap); + + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + msg = NULL; + } } - /* requeue completed messages */ - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); + BUG_ON(num_cap_releases != 0); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, - num); - session->s_num_cap_releases += num; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - list_add(&msg->list_head, &session->s_cap_releases); + spin_lock(&session->s_cap_lock); + if (!list_empty(&session->s_cap_releases)) + goto again; + spin_unlock(&session->s_cap_lock); + + if (msg) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); } + return; +out_err: + pr_err("send_cap_releases mds%d, failed to allocate message\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + list_splice(&tmp_list, &session->s_cap_releases); + session->s_num_cap_releases += num_cap_releases; + spin_unlock(&session->s_cap_lock); } /* @@ -1635,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, order); if (rinfo->dir_in) break; @@ -1697,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) struct ceph_mds_request, r_node); } -static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) { - struct ceph_mds_request *req = __get_oldest_req(mdsc); - - if (req) - return req->r_tid; - return 0; + return mdsc->oldest_tid; } /* @@ -2267,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* wait */ mutex_unlock(&mdsc->mutex); dout("do_request waiting\n"); - if (req->r_timeout) { - err = (long)wait_for_completion_killable_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - err = -EIO; - } else if (req->r_wait_for_completion) { + if (!req->r_timeout && req->r_wait_for_completion) { err = req->r_wait_for_completion(mdsc, req); } else { - err = wait_for_completion_killable(&req->r_completion); + long timeleft = wait_for_completion_killable_timeout( + &req->r_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (timeleft > 0) + err = 0; + else if (!timeleft) + err = -EIO; /* timed out */ + else + err = timeleft; /* killed */ } dout("do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); @@ -2496,7 +2529,6 @@ out_err: } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2888,8 +2920,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - discard_cap_releases(mdsc, session); - spin_unlock(&session->s_cap_lock); + cleanup_cap_releases(mdsc, session); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) @@ -2956,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, reply->hdr.data_len = cpu_to_le32(pagelist->length); ceph_msg_data_add_pagelist(reply, pagelist); + + ceph_early_kick_flushing_caps(mdsc, session); + ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3352,7 +3386,6 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); @@ -3390,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; + mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; + mdsc->oldest_tid = 0; mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; @@ -3402,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); - mdsc->cap_flush_seq = 0; + mdsc->last_cap_flush_tid = 1; + mdsc->cap_flush_tree = RB_ROOT; INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; @@ -3414,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) ceph_caps_init(mdsc); ceph_adjust_min_caps(mdsc, fsc->min_caps); + init_rwsem(&mdsc->pool_perm_rwsem); + mdsc->pool_perm_tree = RB_ROOT; + return 0; } @@ -3423,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) */ static void wait_requests(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; - struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -3432,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - fsc->client->options->mount_timeout * HZ); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -3485,7 +3524,8 @@ restart: nextreq = rb_entry(n, struct ceph_mds_request, r_node); else nextreq = NULL; - if ((req->r_op & CEPH_MDS_OP_WRITE)) { + if (req->r_op != CEPH_MDS_OP_SETFILELOCK && + (req->r_op & CEPH_MDS_OP_WRITE)) { /* write op */ ceph_mdsc_get_request(req); if (nextreq) @@ -3513,7 +3553,7 @@ restart: void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { - u64 want_tid, want_flush; + u64 want_tid, want_flush, want_snap; if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; @@ -3525,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ceph_flush_dirty_caps(mdsc); spin_lock(&mdsc->cap_dirty_lock); - want_flush = mdsc->cap_flush_seq; + want_flush = mdsc->last_cap_flush_tid; spin_unlock(&mdsc->cap_dirty_lock); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + down_read(&mdsc->snap_rwsem); + want_snap = mdsc->last_snap_seq; + up_read(&mdsc->snap_rwsem); + + dout("sync want tid %lld flush_seq %lld snap_seq %lld\n", + want_tid, want_flush, want_snap); wait_unsafe_requests(mdsc, want_tid); - wait_caps_flush(mdsc, want_flush); + wait_caps_flush(mdsc, want_flush, want_snap); } /* @@ -3549,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc) */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_session *session; int i; - struct ceph_fs_client *fsc = mdsc->fsc; - unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); @@ -3573,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("waiting for sessions to close\n"); wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), - timeout); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ mutex_lock(&mdsc->mutex); @@ -3607,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); + ceph_pool_perm_destroy(mdsc); } void ceph_mdsc_destroy(struct ceph_fs_client *fsc) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 1875b5d985c6..762757e6cebf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -139,7 +139,6 @@ struct ceph_mds_session { int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; /* protected by mutex */ @@ -228,7 +227,7 @@ struct ceph_mds_request { int r_err; bool r_aborted; - unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ @@ -254,12 +253,21 @@ struct ceph_mds_request { bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; + long long r_dir_release_cnt; + long long r_dir_ordered_cnt; + int r_readdir_cache_idx; u32 r_readdir_offset; struct ceph_cap_reservation r_caps_reservation; int r_num_caps; }; +struct ceph_pool_perm { + struct rb_node node; + u32 pool; + int perm; +}; + /* * mds client state */ @@ -284,12 +292,15 @@ struct ceph_mds_client { * references (implying they contain no inodes with caps) that * should be destroyed. */ + u64 last_snap_seq; struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ + u64 oldest_tid; /* oldest incomplete mds request, + excluding setfilelock requests */ struct rb_root request_tree; /* pending mds requests */ struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ @@ -298,7 +309,8 @@ struct ceph_mds_client { struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; - u64 cap_flush_seq; + u64 last_cap_flush_tid; + struct rb_root cap_flush_tree; struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ @@ -328,6 +340,9 @@ struct ceph_mds_client { spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; + + struct rw_semaphore pool_perm_rwsem; + struct rb_root pool_perm_tree; }; extern const char *ceph_mds_op_name(int op); @@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a97e39f09ba6..233d906aec02 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b) } -static struct ceph_snap_context *empty_snapc; +struct ceph_snap_context *ceph_empty_snapc; /* * build the snap context for a given realm. @@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } - if (num == 0 && realm->seq == empty_snapc->seq) { - ceph_get_snap_context(empty_snapc); - snapc = empty_snapc; + if (num == 0 && realm->seq == ceph_empty_snapc->seq) { + ceph_get_snap_context(ceph_empty_snapc); + snapc = ceph_empty_snapc; goto done; } @@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) return 0; } +static bool has_new_snaps(struct ceph_snap_context *o, + struct ceph_snap_context *n) +{ + if (n->num_snaps == 0) + return false; + /* snaps are in descending order */ + return n->snaps[0] > o->seq; +} /* * When a snapshot is applied, the size/mtime inode metadata is queued @@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; + struct ceph_snap_context *old_snapc, *new_snapc; int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); @@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + old_snapc = ci->i_head_snapc; + new_snapc = ci->i_snap_realm->cached_context; + /* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish @@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) writes in progress now were started before the previous cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); - kfree(capsnap); - } else if (ci->i_snap_realm->cached_context == empty_snapc) { - dout("queue_cap_snap %p empty snapc\n", inode); - kfree(capsnap); - } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { - struct ceph_snap_context *snapc = ci->i_head_snapc; - - /* - * if we are a sync write, we may need to go to the snaprealm - * to get the current snapc. - */ - if (!snapc) - snapc = ci->i_snap_realm->cached_context; + goto update_snapc; + } + if (ci->i_wrbuffer_ref_head == 0 && + !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + goto update_snapc; + } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", - inode, capsnap, snapc, ceph_cap_string(dirty)); - ihold(inode); + BUG_ON(!old_snapc); - atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; - INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); - - capsnap->follows = snapc->seq; - capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = dirty; - - capsnap->mode = inode->i_mode; - capsnap->uid = inode->i_uid; - capsnap->gid = inode->i_gid; - - if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); - capsnap->xattr_blob = - ceph_buffer_get(ci->i_xattrs.blob); - capsnap->xattr_version = ci->i_xattrs.version; - } else { - capsnap->xattr_blob = NULL; - capsnap->xattr_version = 0; + /* + * There is no need to send FLUSHSNAP message to MDS if there is + * no new snapshot. But when there is dirty pages or on-going + * writes, we still need to create cap_snap. cap_snap is needed + * by the write path and page writeback path. + * + * also see ceph_try_drop_cap_snap() + */ + if (has_new_snaps(old_snapc, new_snapc)) { + if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) + capsnap->need_flush = true; + } else { + if (!(used & CEPH_CAP_FILE_WR) && + ci->i_wrbuffer_ref_head == 0) { + dout("queue_cap_snap %p " + "no new_snap|dirty_page|writing\n", inode); + goto update_snapc; } + } - capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - - /* dirty page count moved from _head to this cap_snap; - all subsequent writes page dirties occur _after_ this - snapshot. */ - capsnap->dirty_pages = ci->i_wrbuffer_ref_head; - ci->i_wrbuffer_ref_head = 0; - capsnap->context = snapc; - ci->i_head_snapc = - ceph_get_snap_context(ci->i_snap_realm->cached_context); - dout(" new snapc is %p\n", ci->i_head_snapc); - list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - - if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, - capsnap, snapc, snapc->seq); - capsnap->writing = 1; - } else { - /* note mtime, size NOW. */ - __ceph_finish_cap_snap(ci, capsnap); - } + dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", + inode, capsnap, old_snapc, ceph_cap_string(dirty), + capsnap->need_flush ? "" : "no_flush"); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = old_snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; } else { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); - kfree(capsnap); + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = old_snapc; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + old_snapc = NULL; + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, old_snapc, old_snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + capsnap = NULL; + +update_snapc: + if (ci->i_head_snapc) { + ci->i_head_snapc = ceph_get_snap_context(new_snapc); + dout(" new snapc is %p\n", new_snapc); + } spin_unlock(&ci->i_ceph_lock); + + kfree(capsnap); + ceph_put_snap_context(old_snapc); } /* @@ -699,6 +730,8 @@ more: /* queue realm for cap_snap creation */ list_add(&realm->dirty_item, &dirty_realms); + if (realm->seq > mdsc->last_snap_seq) + mdsc->last_snap_seq = realm->seq; invalidate = 1; } else if (!realm->cached_context) { @@ -964,14 +997,14 @@ out: int __init ceph_snap_init(void) { - empty_snapc = ceph_create_snap_context(0, GFP_NOFS); - if (!empty_snapc) + ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!ceph_empty_snapc) return -ENOMEM; - empty_snapc->seq = 1; + ceph_empty_snapc->seq = 1; return 0; } void ceph_snap_exit(void) { - ceph_put_snap_context(empty_snapc); + ceph_put_snap_context(ceph_empty_snapc); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4e9905374078..d1c833c321b9 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -134,10 +134,12 @@ enum { Opt_noino32, Opt_fscache, Opt_nofscache, + Opt_poolperm, + Opt_nopoolperm, #ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, #endif - Opt_noacl + Opt_noacl, }; static match_table_t fsopt_tokens = { @@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = { {Opt_noino32, "noino32"}, {Opt_fscache, "fsc"}, {Opt_nofscache, "nofsc"}, + {Opt_poolperm, "poolperm"}, + {Opt_nopoolperm, "nopoolperm"}, #ifdef CONFIG_CEPH_FS_POSIX_ACL {Opt_acl, "acl"}, #endif @@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; break; + case Opt_poolperm: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + printk ("pool perm"); + break; + case Opt_nopoolperm: + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= MS_POSIXACL; @@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) seq_puts(m, ",fsc"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) + seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & MS_POSIXACL) @@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -634,6 +648,10 @@ static int __init init_caches(void) SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (ceph_cap_cachep == NULL) goto bad_cap; + ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_flush_cachep == NULL) + goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); @@ -652,6 +670,8 @@ static int __init init_caches(void) bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: + kmem_cache_destroy(ceph_cap_flush_cachep); +bad_cap_flush: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -668,6 +688,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); @@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fa20e1318939..860cc016e70d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -35,6 +35,7 @@ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ +#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ CEPH_MOUNT_OPT_DCACHE) @@ -121,11 +122,21 @@ struct ceph_cap { struct rb_node ci_node; /* per-ci cap tree */ struct ceph_mds_session *session; struct list_head session_caps; /* per-session caplist */ - int mds; u64 cap_id; /* unique cap id (mds provided) */ - int issued; /* latest, from the mds */ - int implemented; /* implemented superset of issued (for revocation) */ - int mds_wanted; + union { + /* in-use caps */ + struct { + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of + issued (for revocation) */ + int mds, mds_wanted; + }; + /* caps to release */ + struct { + u64 cap_ino; + int queue_release; + }; + }; u32 seq, issue_seq, mseq; u32 cap_gen; /* active/stale cycle */ unsigned long last_used; @@ -163,6 +174,7 @@ struct ceph_cap_snap { int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; + bool need_flush; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) @@ -174,6 +186,17 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) } } +struct ceph_cap_flush { + u64 tid; + int caps; + bool kick; + struct rb_node g_node; // global + union { + struct rb_node i_node; // inode + struct list_head list; + }; +}; + /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where @@ -259,9 +282,9 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - int i_ordered_count; - atomic_t i_release_count; - atomic_t i_complete_count; + atomic64_t i_release_count; + atomic64_t i_ordered_count; + atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -283,11 +306,11 @@ struct ceph_inode_info { struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ struct list_head i_dirty_item, i_flushing_item; - u64 i_cap_flush_seq; /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ - u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + struct ceph_cap_flush *i_prealloc_cap_flush; + struct rb_root i_cap_flush_tree; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ @@ -438,36 +461,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ +#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ +#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ +#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ + static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count, int ordered_count) + long long release_count, + long long ordered_count) { - atomic_set(&ci->i_complete_count, release_count); - if (ci->i_ordered_count == ordered_count) - ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; - else - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + smp_mb__before_atomic(); + atomic64_set(&ci->i_complete_seq[0], release_count); + atomic64_set(&ci->i_complete_seq[1], ordered_count); } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - atomic_inc(&ci->i_release_count); + atomic64_inc(&ci->i_release_count); +} + +static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) +{ + atomic64_inc(&ci->i_ordered_count); } static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) { - return atomic_read(&ci->i_complete_count) == - atomic_read(&ci->i_release_count); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count); } static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) { - return __ceph_dir_is_complete(ci) && - (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count) && + atomic64_read(&ci->i_complete_seq[1]) == + atomic64_read(&ci->i_ordered_count); } static inline void ceph_dir_clear_complete(struct inode *inode) @@ -477,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode) static inline void ceph_dir_clear_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&ci->i_ceph_lock); - ci->i_ordered_count++; - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; - spin_unlock(&ci->i_ceph_lock); + __ceph_dir_clear_ordered(ceph_inode(inode)); } static inline bool ceph_dir_is_complete_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool ret; - spin_lock(&ci->i_ceph_lock); - ret = __ceph_dir_is_complete_ordered(ci); - spin_unlock(&ci->i_ceph_lock); + bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); + smp_rmb(); return ret; } @@ -552,7 +578,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } -extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); +extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf); extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask); @@ -606,16 +635,20 @@ struct ceph_file_info { unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ - struct dentry *dentry; /* next dentry (for dcache readdir) */ - int dir_release_count; - int dir_ordered_count; + long long dir_release_count; + long long dir_ordered_count; + int readdir_cache_idx; /* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len; }; - +struct ceph_readdir_cache_control { + struct page *page; + struct dentry **dentries; + int index; +}; /* * A "snap realm" describes a subset of the file hierarchy sharing @@ -687,6 +720,7 @@ static inline int default_congestion_kb(void) /* snap.c */ +extern struct ceph_snap_context *ceph_empty_snapc; struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, @@ -713,8 +747,8 @@ extern void ceph_snap_exit(void); static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && - list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, - ci_item)->writing; + list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, + ci_item)->writing; } /* inode.c */ @@ -838,12 +872,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); -extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, - u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); +extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, @@ -879,6 +913,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); /* addr.c */ extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); +extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; @@ -890,7 +927,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); -int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; @@ -911,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); +extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index cd7ffad4041d..819163d8313b 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int dirty = 0; @@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, if (!xattr) goto out; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + goto out; + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, name_len, val_len); @@ -966,7 +984,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -978,21 +996,28 @@ retry: flags, value ? 1 : -1, &xattr); if (!err) { - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; } spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); err = ceph_sync_setxattr(dentry, name, value, size, flags); out: + ceph_free_cap_flush(prealloc_cf); kfree(newname); kfree(newval); kfree(xattr); @@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int required_blob_size; int dirty; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) goto do_sync_unlocked; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -1080,7 +1123,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -1090,18 +1133,24 @@ retry: err = __remove_xattr_by_name(ceph_inode(inode), name); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + ceph_free_cap_flush(prealloc_cf); err = ceph_send_removexattr(dentry, name); -out: return err; } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index a2172f3f69e3..e7b478b49985 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -192,6 +192,15 @@ config CIFS_SMB2 options are also slightly simpler (compared to CIFS) due to protocol improvements. +config CIFS_SMB311 + bool "SMB3.1.1 network file system support (Experimental)" + depends on CIFS_SMB2 && INET + + help + This enables experimental support for the newest, SMB3.1.1, dialect. + This dialect includes improved security negotiation features. + If unsure, say N + config CIFS_FSCACHE bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 22b289a3b1c4..b406a32deb1f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -171,6 +171,10 @@ enum smb_version { Smb_21, Smb_30, Smb_302, +#ifdef CONFIG_CIFS_SMB311 + Smb_311, +#endif /* SMB311 */ + Smb_version_err }; struct mid_q_entry; @@ -368,6 +372,8 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); + int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -386,6 +392,9 @@ struct smb_version_operations { int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, u64 src_off, u64 len, u64 dest_off); + int (*duplicate_extents)(const unsigned int, struct cifsFileInfo *src, + struct cifsFileInfo *target_file, u64 src_off, u64 len, + u64 dest_off); int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, const unsigned char *, const unsigned char *, char *, @@ -1617,4 +1626,8 @@ extern struct smb_version_values smb30_values; #define SMB302_VERSION_STRING "3.02" /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ extern struct smb_version_values smb302_values; +#define SMB311_VERSION_STRING "3.1.1" +#define ALT_SMB311_VERSION_STRING "3.11" +extern struct smb_version_operations smb311_operations; +extern struct smb_version_values smb311_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 5f9822ac0245..47b030da0781 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2255,6 +2255,8 @@ typedef struct { /* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */ +#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */ #define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 #define FILE_SUPPORTS_USN_JOURNAL 0x02000000 #define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 @@ -2310,6 +2312,16 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ char FileName[1]; } __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ +typedef struct { + __le64 AllocationSize; + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad; +} __attribute__((packed)) FILE_STANDARD_INFO; /* level 0x102 QPathInfo */ + + /* defines for enumerating possible values of the Unix type field below */ #define UNIX_FILE 0 #define UNIX_DIR 1 diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f26ffbfc64d8..672ef35c9f73 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -625,9 +625,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->negflavor = CIFS_NEGFLAVOR_UNENCAP; memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); - } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || - server->capabilities & CAP_EXTENDED_SECURITY) && - (pSMBr->EncryptionKeyLength == 0)) { + } else if (pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || + server->capabilities & CAP_EXTENDED_SECURITY) { server->negflavor = CIFS_NEGFLAVOR_EXTENDED; rc = decode_ext_sec_blob(ses, pSMBr); } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8383d5ea4202..773f4dc77630 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -280,6 +280,11 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, { Smb_302, SMB302_VERSION_STRING }, +#ifdef CONFIG_CIFS_SMB311 + { Smb_311, SMB311_VERSION_STRING }, + { Smb_311, ALT_SMB311_VERSION_STRING }, +#endif /* SMB311 */ + { Smb_version_err, NULL } }; static int ip_connect(struct TCP_Server_Info *server); @@ -1133,6 +1138,12 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->ops = &smb30_operations; /* currently identical with 3.0 */ vol->vals = &smb302_values; break; +#ifdef CONFIG_CIFS_SMB311 + case Smb_311: + vol->ops = &smb311_operations; + vol->vals = &smb311_values; + break; +#endif /* SMB311 */ #endif default: cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); @@ -3461,6 +3472,8 @@ try_mount_again: else if (ses) cifs_put_smb_ses(ses); + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; + free_xid(xid); } #endif diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 8b7898b7670f..49b8b6e41a18 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -31,12 +31,15 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include <linux/btrfs.h> #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) +#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, - unsigned long srcfd, u64 off, u64 len, u64 destoff) + unsigned long srcfd, u64 off, u64 len, u64 destoff, + bool dup_extents) { int rc; struct cifsFileInfo *smb_file_target = dst_file->private_data; @@ -109,9 +112,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, truncate_inode_pages_range(&target_inode->i_data, destoff, PAGE_CACHE_ALIGN(destoff + len)-1); - if (target_tcon->ses->server->ops->clone_range) + if (dup_extents && target_tcon->ses->server->ops->duplicate_extents) + rc = target_tcon->ses->server->ops->duplicate_extents(xid, + smb_file_src, smb_file_target, off, len, destoff); + else if (!dup_extents && target_tcon->ses->server->ops->clone_range) rc = target_tcon->ses->server->ops->clone_range(xid, smb_file_src, smb_file_target, off, len, destoff); + else + rc = -EOPNOTSUPP; /* force revalidate of size and timestamps of target file now that target is updated on the server */ @@ -205,7 +213,20 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) } break; case CIFS_IOC_COPYCHUNK_FILE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false); + break; + case BTRFS_IOC_CLONE: + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true); + break; + case CIFS_IOC_SET_INTEGRITY: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->set_integrity) + rc = tcon->ses->server->ops->set_integrity(xid, + tcon, pSMBFile); + else + rc = -EOPNOTSUPP; break; default: cifs_dbg(FYI, "unsupported ioctl\n"); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 54daee5ad4c1..df91bcf56d67 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -806,6 +806,53 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, &eof, false); } +#ifdef CONFIG_CIFS_SMB311 +static int +smb2_duplicate_extents(const unsigned int xid, + struct cifsFileInfo *srcfile, + struct cifsFileInfo *trgtfile, u64 src_off, + u64 len, u64 dest_off) +{ + int rc; + unsigned int ret_data_len; + char *retbuf = NULL; + struct duplicate_extents_to_file dup_ext_buf; + struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); + + /* server fileays advertise duplicate extent support with this flag */ + if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) & + FILE_SUPPORTS_BLOCK_REFCOUNTING) == 0) + return -EOPNOTSUPP; + + dup_ext_buf.VolatileFileHandle = srcfile->fid.volatile_fid; + dup_ext_buf.PersistentFileHandle = srcfile->fid.persistent_fid; + dup_ext_buf.SourceFileOffset = cpu_to_le64(src_off); + dup_ext_buf.TargetFileOffset = cpu_to_le64(dest_off); + dup_ext_buf.ByteCount = cpu_to_le64(len); + cifs_dbg(FYI, "duplicate extents: src off %lld dst off %lld len %lld", + src_off, dest_off, len); + + rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); + if (rc) + goto duplicate_extents_out; + + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, + FSCTL_DUPLICATE_EXTENTS_TO_FILE, + true /* is_fsctl */, (char *)&dup_ext_buf, + sizeof(struct duplicate_extents_to_file), + (char **)&retbuf, + &ret_data_len); + + if (ret_data_len > 0) + cifs_dbg(FYI, "non-zero response length in duplicate extents"); + +duplicate_extents_out: + return rc; +} +#endif /* CONFIG_CIFS_SMB311 */ + + static int smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) @@ -815,6 +862,28 @@ smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, } static int +smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + struct fsctl_set_integrity_information_req integr_info; + char *retbuf = NULL; + unsigned int ret_data_len; + + integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); + integr_info.Flags = 0; + integr_info.Reserved = 0; + + return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SET_INTEGRITY_INFORMATION, + true /* is_fsctl */, (char *)&integr_info, + sizeof(struct fsctl_set_integrity_information_req), + (char **)&retbuf, + &ret_data_len); + +} + +static int smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -1624,6 +1693,7 @@ struct smb_version_operations smb30_operations = { .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, .set_oplock_level = smb3_set_oplock_level, .create_lease_buf = smb3_create_lease_buf, @@ -1635,6 +1705,94 @@ struct smb_version_operations smb30_operations = { .fallocate = smb3_fallocate, }; +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_operations smb311_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .dump_share_caps = smb2_dump_share_caps, + .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb3_qfs_tcon, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .generate_signingkey = generate_smb3signingkey, + .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb3_set_oplock_level, + .create_lease_buf = smb3_create_lease_buf, + .parse_lease_buf = smb3_parse_lease_buf, + .clone_range = smb2_clone_range, + .duplicate_extents = smb2_duplicate_extents, +/* .validate_negotiate = smb3_validate_negotiate, */ /* not used in 3.11 */ + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, + .fallocate = smb3_fallocate, +}; +#endif /* CIFS_SMB311 */ + struct smb_version_values smb20_values = { .version_string = SMB20_VERSION_STRING, .protocol_id = SMB20_PROT_ID, @@ -1714,3 +1872,25 @@ struct smb_version_values smb302_values = { .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, .create_lease_size = sizeof(struct create_lease_v2), }; + +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_values smb311_values = { + .version_string = SMB311_VERSION_STRING, + .protocol_id = SMB311_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; +#endif /* SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 54cbe19d9c08..b8b4f08ee094 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -304,6 +304,59 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_SMB311 +/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */ +#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */ + + +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) + +static void +build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(38); + pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); + pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512; +} + +static void +build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(6); + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; +} + +static void +assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + + /* +4 is to account for the RFC1001 len field */ + char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4; + + build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); + /* Add 2 to size to round to 8 byte boundary */ + pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); + build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); + req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); + req->NegotiateContextCount = cpu_to_le16(2); + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ +} +#else +static void assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + return; +} +#endif /* SMB311 */ + + /* * * SMB2 Worker functions follow: @@ -363,10 +416,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* ClientGUID must be zero for SMB2.02 dialect */ if (ses->server->vals->protocol_id == SMB20_PROT_ID) memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); - else + else { memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); - + if (ses->server->vals->protocol_id == SMB311_PROT_ID) + assemble_neg_contexts(req); + } iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; @@ -393,8 +448,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID)) cifs_dbg(FYI, "negotiated smb3.02 dialect\n"); +#ifdef CONFIG_CIFS_SMB311 + else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n"); +#endif /* SMB311 */ else { - cifs_dbg(VFS, "Illegal dialect returned by server %d\n", + cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n", le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; @@ -572,7 +631,7 @@ ssetup_ntlmssp_authenticate: return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ - req->VcNumber = 0; /* MBZ */ + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 70867d54fb8b..451108284a2f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -136,9 +136,6 @@ struct smb2_transform_hdr { __u64 SessionId; } __packed; -/* Encryption Algorithms */ -#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) - /* * SMB2 flag definitions */ @@ -191,7 +188,10 @@ struct smb2_negotiate_req { __le16 Reserved; /* MBZ */ __le32 Capabilities; __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; - __le64 ClientStartTime; /* MBZ */ + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */ + __le16 Reserved2; __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ } __packed; @@ -200,6 +200,7 @@ struct smb2_negotiate_req { #define SMB21_PROT_ID 0x0210 #define SMB30_PROT_ID 0x0300 #define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 #define BAD_PROT_ID 0xFFFF /* SecurityMode flags */ @@ -217,12 +218,38 @@ struct smb2_negotiate_req { #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) + +struct smb2_preauth_neg_context { + __le16 ContextType; /* 1 */ + __le16 DataLength; + __le32 Reserved; + __le16 HashAlgorithmCount; /* 1 */ + __le16 SaltLength; + __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ + __u8 Salt[SMB311_SALT_SIZE]; +} __packed; + +/* Encryption Algorithms Ciphers */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) + +struct smb2_encryption_neg_context { + __le16 ContextType; /* 2 */ + __le16 DataLength; + __le32 Reserved; + __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ + __le16 Ciphers[2]; /* Ciphers[0] since only one used now */ +} __packed; + struct smb2_negotiate_rsp { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 65 */ __le16 SecurityMode; __le16 DialectRevision; - __le16 Reserved; /* MBZ */ + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ __u8 ServerGUID[16]; __le32 Capabilities; __le32 MaxTransactSize; @@ -232,14 +259,18 @@ struct smb2_negotiate_rsp { __le64 ServerStartTime; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le32 Reserved2; /* may be any value, ignore */ + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + struct smb2_sess_setup_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 25 */ - __u8 VcNumber; + __u8 Flags; __u8 SecurityMode; __le32 Capabilities; __le32 Channel; @@ -274,10 +305,13 @@ struct smb2_logoff_rsp { __le16 Reserved; } __packed; +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001 + struct smb2_tree_connect_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ - __le16 Reserved; + __le16 Reserved; /* Flags in SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; __u8 Buffer[1]; /* variable length */ @@ -587,6 +621,29 @@ struct copychunk_ioctl_rsp { __le32 TotalBytesWritten; } __packed; +struct fsctl_set_integrity_information_req { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; +} __packed; + +struct fsctl_get_integrity_information_rsp { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; + __le32 ChecksumChunkSizeInBytes; + __le32 ClusterSizeInBytes; +} __packed; + +/* Integrity ChecksumAlgorithm choices for above */ +#define CHECKSUM_TYPE_NONE 0x0000 +#define CHECKSUM_TYPE_CRC64 0x0002 +#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */ + +/* Integrity flags for above */ +#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 + + struct validate_negotiate_info_req { __le32 Capabilities; __u8 Guid[SMB2_CLIENT_GUID_SIZE]; @@ -620,6 +677,14 @@ struct compress_ioctl { __le16 CompressionState; /* See cifspdu.h for possible flag values */ } __packed; +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + struct smb2_ioctl_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 83efa59535be..a639d0dab453 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -75,10 +75,13 @@ #define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ +#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344 #define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ #define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ /* strange that the number for this op is not sequential with previous op */ diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d6f7a76a1f5b..f829fe963f5b 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -79,7 +79,7 @@ void coda_sysctl_clean(void); static inline struct coda_inode_info *ITOC(struct inode *inode) { - return list_entry(inode, struct coda_inode_info, vfs_inode); + return container_of(inode, struct coda_inode_info, vfs_inode); } static __inline__ struct CodaFid *coda_i2f(struct inode *inode) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 8d89f5fd0331..eae87575e681 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -236,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && d_really_is_positive(dentry)) { + if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 537356742091..a8f3b589a2df 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -129,8 +129,6 @@ void configfs_release_fs(void) } -static struct kobject *config_kobj; - static int __init configfs_init(void) { int err = -ENOMEM; @@ -141,8 +139,8 @@ static int __init configfs_init(void) if (!configfs_dir_cachep) goto out; - config_kobj = kobject_create_and_add("config", kernel_kobj); - if (!config_kobj) + err = sysfs_create_mount_point(kernel_kobj, "config"); + if (err) goto out2; err = register_filesystem(&configfs_fs_type); @@ -152,7 +150,7 @@ static int __init configfs_init(void) return 0; out3: pr_err("Unable to register filesystem!\n"); - kobject_put(config_kobj); + sysfs_remove_mount_point(kernel_kobj, "config"); out2: kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; @@ -163,7 +161,7 @@ out: static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); - kobject_put(config_kobj); + sysfs_remove_mount_point(kernel_kobj, "config"); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; } diff --git a/fs/coredump.c b/fs/coredump.c index e52e0064feac..c5ecde6f3eed 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -140,7 +140,7 @@ static int cn_print_exe_file(struct core_name *cn) goto put_exe_file; } - path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + path = file_path(exe_file, pathbuf, PATH_MAX); if (IS_ERR(path)) { ret = PTR_ERR(path); goto free_buf; @@ -155,7 +155,7 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, } if (iov_iter_rw(iter) == WRITE) - len = copy_from_iter(addr, max - pos, iter); + len = copy_from_iter_nocache(addr, max - pos, iter); else if (!hole) len = copy_to_iter(addr, max - pos, iter); else @@ -209,7 +209,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, } /* Protects against truncate */ - inode_dio_begin(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); retval = dax_io(inode, iter, pos, end, get_block, &bh); @@ -219,7 +220,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); - inode_dio_end(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(inode); out: return retval; } @@ -309,14 +311,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, out: i_mmap_unlock_read(mapping); - if (bh->b_end_io) - bh->b_end_io(bh, 1); - return error; } -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +/** + * __dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. __dax_fault() assumes the caller has done all + * the necessary locking for the page fault to proceed successfully. + */ +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -417,7 +426,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page_cache_release(page); } + /* + * If we successfully insert the new mapping over an unwritten extent, + * we need to ensure we convert the unwritten extent. If there is an + * error inserting the mapping, the filesystem needs to leave it as + * unwritten to prevent exposure of the stale underlying data to + * userspace, but we still need to call the completion function so + * the private resources on the mapping buffer can be released. We + * indicate what the callback should do via the uptodate variable, same + * as for normal BH based IO completions. + */ error = dax_insert_mapping(inode, &bh, vma, vmf); + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !error); out: if (error == -ENOMEM) @@ -434,6 +455,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } goto out; } +EXPORT_SYMBOL(__dax_fault); /** * dax_fault - handle a page fault on a DAX file @@ -445,7 +467,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) + get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -454,7 +476,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = do_dax_fault(vma, vmf, get_block); + result = __dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); diff --git a/fs/dcache.c b/fs/dcache.c index 592c4b582495..7a3f3e5f9cea 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1673,7 +1673,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_DELETE )); + DCACHE_OP_DELETE | + DCACHE_OP_SELECT_INODE)); dentry->d_op = op; if (!op) return; @@ -1689,6 +1690,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; + if (op->d_select_inode) + dentry->d_flags |= DCACHE_OP_SELECT_INODE; } EXPORT_SYMBOL(d_set_d_op); @@ -2927,17 +2930,6 @@ restart: vfsmnt = &mnt->mnt; continue; } - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || - dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, - dentry->d_name.name); - } if (!error) error = is_mounted(vfsmnt) ? 1 : 2; break; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7eaec88ea970..c711be8d6a3c 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb) return inode; } -static inline int debugfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - struct debugfs_mount_opts { kuid_t uid; kgid_t gid; @@ -522,7 +517,7 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (debugfs_positive(dentry)) { + if (simple_positive(dentry)) { dget(dentry); if (d_is_dir(dentry)) ret = simple_rmdir(d_inode(parent), dentry); @@ -602,7 +597,7 @@ void debugfs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!debugfs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -623,7 +618,7 @@ void debugfs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * debugfs_positive() check. + * simple_positive() check. */ goto loop; } @@ -716,20 +711,17 @@ bool debugfs_initialized(void) } EXPORT_SYMBOL_GPL(debugfs_initialized); - -static struct kobject *debug_kobj; - static int __init debugfs_init(void) { int retval; - debug_kobj = kobject_create_and_add("debug", kernel_kobj); - if (!debug_kobj) - return -EINVAL; + retval = sysfs_create_mount_point(kernel_kobj, "debug"); + if (retval) + return retval; retval = register_filesystem(&debug_fs_type); if (retval) - kobject_put(debug_kobj); + sysfs_remove_mount_point(kernel_kobj, "debug"); else debugfs_registered = true; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index add566303c68..c35ffdc12bba 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt->mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb->s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb->s_root; + fsi = DEVPTS_SB(sb); + opts = &fsi->mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,16 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + struct vfsmount *mnt; + table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(&devpts_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4deb0b05b011..e5bb2abf77f9 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page) page_cache_release(page); } -/* Accesses dir's inode->i_size must be called under inode lock */ -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) { loff_t last_byte = inode->i_size; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 796b491e6978..0c6638b40f21 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 3a0a6c6406d0..3b57c9f83c9b 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -28,12 +28,12 @@ #ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext2_get_block); + return dax_fault(vma, vmf, ext2_get_block, NULL); } static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext2_get_block); + return dax_mkwrite(vma, vmf, ext2_get_block, NULL); } static const struct vm_operations_struct ext2_dax_vm_ops = { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac517f15741c..bc313ac5d3fa 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -192,15 +192,27 @@ out: } #ifdef CONFIG_FS_DAX +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block); + return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); /* Is this the right get_block? */ } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block); + return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f8a8d4ee7459..41f8e55afcd1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -656,18 +656,6 @@ has_zeroout: return retval; } -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + if (IS_DAX(inode) && buffer_unwritten(bh)) { + /* + * dgc: I suspect unwritten conversion on ext4+DAX is + * fundamentally broken here when there are concurrent + * read/write in progress on this inode. + */ + WARN_ON_ONCE(io_end); bh->b_assoc_map = inode->i_mapping; bh->b_private = (void *)(unsigned long)iblock; - bh->b_end_io = ext4_end_io_unwritten; } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5c787647afe2..58987b5c514b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -452,7 +452,7 @@ void __ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { - path = d_path(&(file->f_path), pathname, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); diff --git a/fs/file.c b/fs/file.c index 93c5f89c248b..6c672ad329e9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); + + /* make sure all __fd_install() have seen resize_in_progress + * or have finished their rcu_read_lock_sched() section. + */ + if (atomic_read(&files->count) > 1) + synchronize_sched(); + spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) __free_fdtable(new_fdt); return -EMFILE; } - /* - * Check again since another task may have expanded the fd table while - * we dropped the lock - */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds) { - /* Continue as planned */ - copy_fdtable(new_fdt, cur_fdt); - rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt != &files->fdtab) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - } else { - /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); - } + BUG_ON(nr < cur_fdt->max_fds); + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + /* coupled with smp_rmb() in __fd_install() */ + smp_wmb(); return 1; } @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) { struct fdtable *fdt; + int expanded = 0; +repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return 0; + return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; + if (unlikely(files->resize_in_progress)) { + spin_unlock(&files->file_lock); + expanded = 1; + wait_event(files->resize_wait, !files->resize_in_progress); + spin_lock(&files->file_lock); + goto repeat; + } + /* All good, so we try */ - return expand_fdtable(files, nr); + files->resize_in_progress = true; + expanded = expand_fdtable(files, nr); + files->resize_in_progress = false; + + wake_up_all(&files->resize_wait); + return expanded; } static inline void __set_close_on_exec(int fd, struct fdtable *fdt) @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->resize_in_progress = false; + init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; @@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); + + might_sleep(); + rcu_read_lock_sched(); + + while (unlikely(files->resize_in_progress)) { + rcu_read_unlock_sched(); + wait_event(files->resize_wait, !files->resize_in_progress); + rcu_read_lock_sched(); + } + /* coupled with smp_wmb() in expand_fdtable() */ + smp_rmb(); + fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) @@ -635,11 +664,17 @@ static struct file *__fget(unsigned int fd, fmode_t mask) struct file *file; rcu_read_lock(); +loop: file = fcheck_files(files, fd); if (file) { - /* File object ref couldn't be taken */ - if ((file->f_mode & mask) || !get_file_rcu(file)) + /* File object ref couldn't be taken. + * dup2() atomicity guarantee is the reason + * we loop to catch the new file (or NULL pointer) + */ + if (file->f_mode & mask) file = NULL; + else if (!get_file_rcu(file)) + goto loop; } rcu_read_unlock(); diff --git a/fs/file_table.c b/fs/file_table.c index 294174dcc226..7f9d407c7595 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -20,7 +20,6 @@ #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> -#include <linux/lglock.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/hardirq.h> diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 99c7f0a37af4..484b32d3234a 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = { .iterate = vxfs_readdir, }; - -static inline u_long -dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static inline u_long dir_blocks(struct inode *ip) { diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 89acec742e0b..d403c69bee08 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (fscache_object_is_dead(object)) { + if (fscache_object_is_dying(object) || + fscache_cache_is_broken(object)) { spin_unlock(&cookie->lock); goto error; } @@ -671,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL); + fscache_operation_init(op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -695,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) /* the work queue now carries its own ref on the object */ spin_unlock(&cookie->lock); - ret = fscache_wait_for_operation_activation(object, op, - NULL, NULL, NULL); + ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); if (ret == 0) { /* ask the cache to honour the operation */ ret = object->cache->ops->check_consistency(op); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 7872a62ef30c..97ec45110957 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -124,8 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, - void (*)(struct fscache_operation *)); +extern int fscache_cancel_op(struct fscache_operation *, bool); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); @@ -138,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); extern int fscache_wait_for_operation_activation(struct fscache_object *, struct fscache_operation *, atomic_t *, - atomic_t *, - void (*)(struct fscache_operation *)); + atomic_t *); extern void fscache_invalidate_writes(struct fscache_cookie *); /* @@ -164,6 +162,7 @@ extern atomic_t fscache_n_op_pend; extern atomic_t fscache_n_op_run; extern atomic_t fscache_n_op_enqueue; extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_initialised; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; extern atomic_t fscache_n_op_cancelled; @@ -271,6 +270,11 @@ extern atomic_t fscache_n_cop_write_page; extern atomic_t fscache_n_cop_uncache_page; extern atomic_t fscache_n_cop_dissociate_pages; +extern atomic_t fscache_n_cache_no_space_reject; +extern atomic_t fscache_n_cache_stale_objects; +extern atomic_t fscache_n_cache_retired_objects; +extern atomic_t fscache_n_cache_culled_objects; + static inline void fscache_stat(atomic_t *stat) { atomic_inc(stat); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index da032daf0e0d..9e792e30f4db 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -328,6 +328,17 @@ void fscache_object_init(struct fscache_object *object, EXPORT_SYMBOL(fscache_object_init); /* + * Mark the object as no longer being live, making sure that we synchronise + * against op submission. + */ +static inline void fscache_mark_object_dead(struct fscache_object *object) +{ + spin_lock(&object->lock); + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + spin_unlock(&object->lock); +} + +/* * Abort object initialisation before we start it. */ static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, @@ -610,6 +621,8 @@ static const struct fscache_state *fscache_lookup_failure(struct fscache_object object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); + set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags); + cookie = object->cookie; set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) @@ -629,7 +642,7 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob _enter("{OBJ%x,%d,%d},%d", object->debug_id, object->n_ops, object->n_children, event); - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); object->oob_event_mask = 0; if (list_empty(&object->dependents) && @@ -948,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + fscache_operation_init(op, object->cache->ops->invalidate_object, + NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -974,13 +988,13 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj return transit_to(UPDATE_OBJECT); nomem: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); fscache_unuse_cookie(object); _leave(" [ENOMEM]"); return transit_to(KILL_OBJECT); submit_op_failed: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); spin_unlock(&cookie->lock); fscache_unuse_cookie(object); kfree(op); @@ -1016,3 +1030,50 @@ static const struct fscache_state *fscache_update_object(struct fscache_object * _leave(""); return transit_to(WAIT_FOR_CMD); } + +/** + * fscache_object_retrying_stale - Note retrying stale object + * @object: The object that will be retried + * + * Note that an object lookup found an on-disk object that was adjudged to be + * stale and has been deleted. The lookup will be retried. + */ +void fscache_object_retrying_stale(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cache_no_space_reject); +} +EXPORT_SYMBOL(fscache_object_retrying_stale); + +/** + * fscache_object_mark_killed - Note that an object was killed + * @object: The object that was culled + * @why: The reason the object was killed. + * + * Note that an object was killed. Returns true if the object was + * already marked killed, false if it wasn't. + */ +void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why) +{ + if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) { + pr_err("Error: Object already killed by cache [%s]\n", + object->cache->identifier); + return; + } + + switch (why) { + case FSCACHE_OBJECT_NO_SPACE: + fscache_stat(&fscache_n_cache_no_space_reject); + break; + case FSCACHE_OBJECT_IS_STALE: + fscache_stat(&fscache_n_cache_stale_objects); + break; + case FSCACHE_OBJECT_WAS_RETIRED: + fscache_stat(&fscache_n_cache_retired_objects); + break; + case FSCACHE_OBJECT_WAS_CULLED: + fscache_stat(&fscache_n_cache_culled_objects); + break; + } +} +EXPORT_SYMBOL(fscache_object_mark_killed); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e7b87a0e5185..de67745e1cd7 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,35 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +static void fscache_operation_dummy_cancel(struct fscache_operation *op) +{ +} + +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags, + * object and processor if needed. + */ +void fscache_operation_init(struct fscache_operation *op, + fscache_operation_processor_t processor, + fscache_operation_cancel_t cancel, + fscache_operation_release_t release) +{ + INIT_WORK(&op->work, fscache_op_work_func); + atomic_set(&op->usage, 1); + op->state = FSCACHE_OP_ST_INITIALISED; + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->processor = processor; + op->cancel = cancel ?: fscache_operation_dummy_cancel; + op->release = release; + INIT_LIST_HEAD(&op->pend_link); + fscache_stat(&fscache_n_op_initialised); +} +EXPORT_SYMBOL(fscache_operation_init); + /** * fscache_enqueue_operation - Enqueue an operation for processing * @op: The operation to enqueue @@ -76,6 +105,43 @@ static void fscache_run_op(struct fscache_object *object, } /* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + const struct fscache_state *ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* * submit an exclusive operation for an object * - other ops are excluded from running simultaneously with this one * - this gets any extra refs it needs on an op @@ -83,6 +149,8 @@ static void fscache_run_op(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { + const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); @@ -95,8 +163,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object, ASSERTCMP(object->n_ops, >=, object->n_exclusive); ASSERT(list_empty(&op->pend_link)); + ostate = object->state; + smp_rmb(); + op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -118,7 +199,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -126,12 +207,15 @@ int fscache_submit_exclusive_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } else { - /* If we're in any other state, there must have been an I/O - * error of some nature. - */ - ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); - ret = -EIO; + fscache_report_unexpected_submission(object, op, ostate); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } spin_unlock(&object->lock); @@ -139,43 +223,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, } /* - * report an unexpected submission - */ -static void fscache_report_unexpected_submission(struct fscache_object *object, - struct fscache_operation *op, - const struct fscache_state *ostate) -{ - static bool once_only; - struct fscache_operation *p; - unsigned n; - - if (once_only) - return; - once_only = true; - - kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, object->state->name); - kdebug("objstate=%s [%s]", object->state->name, ostate->name); - kdebug("objflags=%lx", object->flags); - kdebug("objevent=%lx [%lx]", object->events, object->event_mask); - kdebug("ops=%u inp=%u exc=%u", - object->n_ops, object->n_in_progress, object->n_exclusive); - - if (!list_empty(&object->pending_ops)) { - n = 0; - list_for_each_entry(p, &object->pending_ops, pend_link) { - ASSERTCMP(p->object, ==, object); - kdebug("%p %p", op->processor, op->release); - n++; - } - - kdebug("n=%u", n); - } - - dump_stack(); -} - -/* * submit an operation for an object * - objects may be submitted only in the following states: * - during object creation (write ops may be submitted) @@ -187,6 +234,7 @@ int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -204,7 +252,17 @@ int fscache_submit_op(struct fscache_object *object, smp_rmb(); op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; @@ -222,23 +280,21 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (fscache_object_is_dying(object)) { - fscache_stat(&fscache_n_op_rejected); + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; - } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -293,9 +349,10 @@ void fscache_start_operations(struct fscache_object *object) * cancel an operation that's pending on an object */ int fscache_cancel_op(struct fscache_operation *op, - void (*do_cancel)(struct fscache_operation *)) + bool cancel_in_progress_op) { struct fscache_object *object = op->object; + bool put = false; int ret; _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); @@ -309,19 +366,37 @@ int fscache_cancel_op(struct fscache_operation *op, ret = -EBUSY; if (op->state == FSCACHE_OP_ST_PENDING) { ASSERT(!list_empty(&op->pend_link)); - fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); - if (do_cancel) - do_cancel(op); + put = true; + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + ret = 0; + } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { + ASSERTCMP(object->n_in_progress, >, 0); + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - fscache_put_operation(op); ret = 0; } + if (put) + fscache_put_operation(op); spin_unlock(&object->lock); _leave(" = %d", ret); return ret; @@ -345,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object) list_del_init(&op->pend_link); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) @@ -377,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); - op->state = cancelled ? - FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + if (!cancelled) { + op->state = FSCACHE_OP_ST_COMPLETE; + } else { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + } if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -409,9 +489,9 @@ void fscache_put_operation(struct fscache_operation *op) return; _debug("PUT OP"); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, + ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && + op->state != FSCACHE_OP_ST_COMPLETE, op->state, ==, FSCACHE_OP_ST_CANCELLED); - op->state = FSCACHE_OP_ST_DEAD; fscache_stat(&fscache_n_op_release); @@ -419,37 +499,39 @@ void fscache_put_operation(struct fscache_operation *op) op->release(op); op->release = NULL; } + op->state = FSCACHE_OP_ST_DEAD; object = op->object; + if (likely(object)) { + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); - if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) - fscache_unuse_cookie(object); - - /* now... we may get called with the object spinlock held, so we - * complete the cleanup here only if we can immediately acquire the - * lock, and defer it otherwise */ - if (!spin_trylock(&object->lock)) { - _debug("defer put"); - fscache_stat(&fscache_n_op_deferred_release); + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - cache = object->cache; - spin_lock(&cache->op_gc_list_lock); - list_add_tail(&op->pend_link, &cache->op_gc_list); - spin_unlock(&cache->op_gc_list_lock); - schedule_work(&cache->op_gc); - _leave(" [defer]"); - return; + spin_unlock(&object->lock); } - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - kfree(op); _leave(" [done]"); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index de33b3fccca6..483bbc613bf0 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL); + fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -239,7 +239,7 @@ nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_operation(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); @@ -249,6 +249,17 @@ nobufs: EXPORT_SYMBOL(__fscache_attr_changed); /* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + atomic_set(&op->n_pages, 0); +} + +/* * release a retrieval op reference */ static void fscache_release_retrieval_op(struct fscache_operation *_op) @@ -258,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(atomic_read(&op->n_pages), ==, 0); + ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, + atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) - fscache_put_context(op->op.object->cookie, op->context); + fscache_put_context(op->cookie, op->context); _leave(""); } @@ -285,15 +297,24 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, + fscache_do_cancel_retrieval, + fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); + op->cookie = cookie; op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); + + /* Pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure. + */ + if (context) + fscache_get_context(op->cookie, context); return op; } @@ -330,24 +351,12 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) } /* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - -/* * wait for an object to become active (or dead) */ int fscache_wait_for_operation_activation(struct fscache_object *object, struct fscache_operation *op, atomic_t *stat_op_waits, - atomic_t *stat_object_dead, - void (*do_cancel)(struct fscache_operation *)) + atomic_t *stat_object_dead) { int ret; @@ -359,7 +368,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(op, do_cancel); + ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -377,11 +386,13 @@ check_if_dead: _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } - if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); - fscache_cancel_op(op, do_cancel); + if (unlikely(fscache_object_is_dying(object) || + fscache_cache_is_broken(object))) { + enum fscache_operation_state state = op->state; + fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [obj dead %d]", state); return -ENOBUFS; } return 0; @@ -453,17 +464,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -503,7 +509,7 @@ nobufs_unlock: spin_unlock(&cookie->lock); if (wake_cookie) __fscache_wake_unused_cookie(cookie); - kfree(op); + fscache_put_retrieval(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -584,17 +590,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -632,7 +633,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -700,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_allocs_object_dead)); if (ret < 0) goto error; @@ -726,7 +726,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -944,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, + fscache_operation_init(&op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | @@ -1016,7 +1016,7 @@ already_pending: spin_unlock(&object->lock); spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); fscache_stat(&fscache_n_stores_ok); _leave(" = 0"); return 0; @@ -1036,7 +1036,7 @@ nobufs_unlock_obj: nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); @@ -1044,7 +1044,7 @@ nobufs: return -ENOBUFS; nomem_free: - kfree(op); + fscache_put_operation(&op->op); nomem: fscache_stat(&fscache_n_stores_oom); _leave(" = -ENOMEM"); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 40d13c70ef51..7cfa0aacdf6d 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -23,6 +23,7 @@ atomic_t fscache_n_op_run; atomic_t fscache_n_op_enqueue; atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_initialised; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; atomic_t fscache_n_op_cancelled; @@ -130,6 +131,11 @@ atomic_t fscache_n_cop_write_page; atomic_t fscache_n_cop_uncache_page; atomic_t fscache_n_cop_dissociate_pages; +atomic_t fscache_n_cache_no_space_reject; +atomic_t fscache_n_cache_stale_objects; +atomic_t fscache_n_cache_retired_objects; +atomic_t fscache_n_cache_culled_objects; + /* * display the general statistics */ @@ -246,7 +252,8 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_op_enqueue), atomic_read(&fscache_n_op_cancelled), atomic_read(&fscache_n_op_rejected)); - seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_initialised), atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), atomic_read(&fscache_n_op_gc)); @@ -271,6 +278,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cop_write_page), atomic_read(&fscache_n_cop_uncache_page), atomic_read(&fscache_n_cop_dissociate_pages)); + seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n", + atomic_read(&fscache_n_cache_no_space_reject), + atomic_read(&fscache_n_cache_stale_objects), + atomic_read(&fscache_n_cache_retired_objects), + atomic_read(&fscache_n_cache_culled_objects)); return 0; } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e5bbf748b698..eae2c11268bc 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -489,6 +489,7 @@ static void cuse_fc_release(struct fuse_conn *fc) */ static int cuse_channel_open(struct inode *inode, struct file *file) { + struct fuse_dev *fud; struct cuse_conn *cc; int rc; @@ -499,17 +500,22 @@ static int cuse_channel_open(struct inode *inode, struct file *file) fuse_conn_init(&cc->fc); + fud = fuse_dev_alloc(&cc->fc); + if (!fud) { + kfree(cc); + return -ENOMEM; + } + INIT_LIST_HEAD(&cc->list); cc->fc.release = cuse_fc_release; - cc->fc.connected = 1; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { - fuse_conn_put(&cc->fc); + fuse_dev_free(fud); return rc; } - file->private_data = &cc->fc; /* channel owns base reference to cc */ + file->private_data = fud; return 0; } @@ -527,7 +533,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct cuse_conn *cc = fc_to_cc(file->private_data); + struct fuse_dev *fud = file->private_data; + struct cuse_conn *cc = fc_to_cc(fud->fc); int rc; /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c8b68ab2e574..80cc1b35d460 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,13 +25,13 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; -static struct fuse_conn *fuse_get_conn(struct file *file) +static struct fuse_dev *fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return file->private_data; + return ACCESS_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, @@ -48,6 +48,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; + __set_bit(FR_PENDING, &req->flags); } static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) @@ -168,6 +169,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (!fc->connected) goto out; + err = -ECONNREFUSED; + if (fc->conn_error) + goto out; + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) { @@ -177,8 +182,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, } fuse_req_init_context(req); - req->waiting = 1; - req->background = for_background; + __set_bit(FR_WAITING, &req->flags); + if (for_background) + __set_bit(FR_BACKGROUND, &req->flags); + return req; out: @@ -268,15 +275,15 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, req = get_reserved_req(fc, file); fuse_req_init_context(req); - req->waiting = 1; - req->background = 0; + __set_bit(FR_WAITING, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); return req; } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - if (unlikely(req->background)) { + if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background * request was allocated but not sent @@ -287,8 +294,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) spin_unlock(&fc->lock); } - if (req->waiting) + if (test_bit(FR_WAITING, &req->flags)) { + __clear_bit(FR_WAITING, &req->flags); atomic_dec(&fc->num_waiting); + } if (req->stolen_file) put_reserved_req(fc, req); @@ -309,46 +318,38 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } -static u64 fuse_get_unique(struct fuse_conn *fc) +static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - fc->reqctr++; - /* zero is special */ - if (fc->reqctr == 0) - fc->reqctr = 1; - - return fc->reqctr; + return ++fiq->reqctr; } -static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - list_add_tail(&req->list, &fc->pending); - req->state = FUSE_REQ_PENDING; - if (!req->waiting) { - req->waiting = 1; - atomic_inc(&fc->num_waiting); - } - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + list_add_tail(&req->list, &fiq->pending); + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, u64 nodeid, u64 nlookup) { + struct fuse_iqueue *fiq = &fc->iq; + forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fc->lock); - if (fc->connected) { - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } else { kfree(forget); } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -356,12 +357,15 @@ static void flush_bg_queue(struct fuse_conn *fc) while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; + struct fuse_iqueue *fiq = &fc->iq; req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); + spin_unlock(&fiq->waitq.lock); } } @@ -372,20 +376,22 @@ static void flush_bg_queue(struct fuse_conn *fc) * was closed. The requester thread is woken up (if still waiting), * the 'end' callback is called if given, else the reference to the * request is released - * - * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; - list_del(&req->list); - list_del(&req->intr_entry); - req->state = FUSE_REQ_FINISHED; - if (req->background) { - req->background = 0; + struct fuse_iqueue *fiq = &fc->iq; + + if (test_and_set_bit(FR_FINISHED, &req->flags)) + return; + spin_lock(&fiq->waitq.lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + WARN_ON(test_bit(FR_PENDING, &req->flags)); + WARN_ON(test_bit(FR_SENT, &req->flags)); + if (test_bit(FR_BACKGROUND, &req->flags)) { + spin_lock(&fc->lock); + clear_bit(FR_BACKGROUND, &req->flags); if (fc->num_background == fc->max_background) fc->blocked = 0; @@ -401,122 +407,105 @@ __releases(fc->lock) fc->num_background--; fc->active_background--; flush_bg_queue(fc); + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); wake_up(&req->waitq); - if (end) - end(fc, req); + if (req->end) + req->end(fc, req); fuse_put_request(fc, req); } -static void wait_answer_interruptible(struct fuse_conn *fc, - struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) -{ - if (signal_pending(current)) - return; - - spin_unlock(&fc->lock); - wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); -} - -static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - list_add_tail(&req->intr_entry, &fc->interrupts); - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + wake_up_locked(&fiq->waitq); + } + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) { + struct fuse_iqueue *fiq = &fc->iq; + int err; + if (!fc->no_interrupt) { /* Any signal may interrupt this */ - wait_answer_interruptible(fc, req); - - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); + if (!err) return; - req->interrupted = 1; - if (req->state == FUSE_REQ_SENT) - queue_interrupt(fc, req); + set_bit(FR_INTERRUPTED, &req->flags); + /* matches barrier in fuse_dev_do_read() */ + smp_mb__after_atomic(); + if (test_bit(FR_SENT, &req->flags)) + queue_interrupt(fiq, req); } - if (!req->force) { + if (!test_bit(FR_FORCE, &req->flags)) { sigset_t oldset; /* Only fatal signals may interrupt this */ block_sigs(&oldset); - wait_answer_interruptible(fc, req); + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); restore_sigs(&oldset); - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + if (!err) return; + spin_lock(&fiq->waitq.lock); /* Request is not yet in userspace, bail out */ - if (req->state == FUSE_REQ_PENDING) { + if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); + spin_unlock(&fiq->waitq.lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } + spin_unlock(&fiq->waitq.lock); } /* * Either request is already in userspace, or it was forced. * Wait it out. */ - spin_unlock(&fc->lock); - wait_event(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); - - if (!req->aborted) - return; - - aborted: - BUG_ON(req->state != FUSE_REQ_FINISHED); - if (req->locked) { - /* This is uninterruptible sleep, because data is - being copied to/from the buffers of req. During - locked state, there mustn't be any filesystem - operation (e.g. page fault), since that could lead - to deadlock */ - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - spin_lock(&fc->lock); - } + wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(req->background); - spin_lock(&fc->lock); - if (!fc->connected) + struct fuse_iqueue *fiq = &fc->iq; + + BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) { + spin_unlock(&fiq->waitq.lock); req->out.h.error = -ENOTCONN; - else if (fc->conn_error) - req->out.h.error = -ECONNREFUSED; - else { - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + } else { + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); /* acquire extra reference, since request is still needed after request_end() */ __fuse_get_request(req); + spin_unlock(&fiq->waitq.lock); request_wait_answer(fc, req); + /* Pairs with smp_wmb() in request_end() */ + smp_rmb(); } - spin_unlock(&fc->lock); } void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; + __set_bit(FR_ISREPLY, &req->flags); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } __fuse_request_send(fc, req); } EXPORT_SYMBOL_GPL(fuse_request_send); @@ -586,10 +575,20 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -static void fuse_request_send_nowait_locked(struct fuse_conn *fc, - struct fuse_req *req) +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) { - BUG_ON(!req->background); + BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } + __set_bit(FR_ISREPLY, &req->flags); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; @@ -602,54 +601,40 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, flush_bg_queue(fc); } -static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { + BUG_ON(!req->end); spin_lock(&fc->lock); if (fc->connected) { - fuse_request_send_nowait_locked(fc, req); + fuse_request_send_background_locked(fc, req); spin_unlock(&fc->lock); } else { + spin_unlock(&fc->lock); req->out.h.error = -ENOTCONN; - request_end(fc, req); + req->end(fc, req); + fuse_put_request(fc, req); } } - -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait(fc, req); -} EXPORT_SYMBOL_GPL(fuse_request_send_background); static int fuse_request_send_notify_reply(struct fuse_conn *fc, struct fuse_req *req, u64 unique) { int err = -ENODEV; + struct fuse_iqueue *fiq = &fc->iq; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fc->lock); - if (fc->connected) { - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + queue_request(fiq, req); err = 0; } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait_locked(fc, req); -} - void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); @@ -665,7 +650,7 @@ void fuse_force_forget(struct file *file, u64 nodeid) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); __fuse_request_send(fc, req); /* ignore errors */ fuse_put_request(fc, req); @@ -676,38 +661,39 @@ void fuse_force_forget(struct file *file, u64 nodeid) * anything that could cause a page-fault. If the request was already * aborted bail out. */ -static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +static int lock_request(struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fc->lock); - if (req->aborted) + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; else - req->locked = 1; - spin_unlock(&fc->lock); + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } return err; } /* - * Unlock request. If it was aborted during being locked, the - * requester thread is currently waiting for it to be unlocked, so - * wake it up. + * Unlock request. If it was aborted while locked, caller is responsible + * for unlocking and ending the request. */ -static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +static int unlock_request(struct fuse_req *req) { + int err = 0; if (req) { - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) - wake_up(&req->waitq); - spin_unlock(&fc->lock); + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) + err = -ENOENT; + else + clear_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } + return err; } struct fuse_copy_state { - struct fuse_conn *fc; int write; struct fuse_req *req; struct iov_iter *iter; @@ -721,13 +707,10 @@ struct fuse_copy_state { unsigned move_pages:1; }; -static void fuse_copy_init(struct fuse_copy_state *cs, - struct fuse_conn *fc, - int write, +static void fuse_copy_init(struct fuse_copy_state *cs, int write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); - cs->fc = fc; cs->write = write; cs->iter = iter; } @@ -760,7 +743,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct page *page; int err; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); if (cs->pipebufs) { struct pipe_buffer *buf = cs->pipebufs; @@ -809,7 +795,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) iov_iter_advance(cs->iter, err); } - return lock_request(cs->fc, cs->req); + return lock_request(cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -860,7 +846,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); err = buf->ops->confirm(cs->pipe, buf); @@ -914,12 +903,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) lru_cache_add_file(newpage); err = 0; - spin_lock(&cs->fc->lock); - if (cs->req->aborted) + spin_lock(&cs->req->waitq.lock); + if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else *pagep = newpage; - spin_unlock(&cs->fc->lock); + spin_unlock(&cs->req->waitq.lock); if (err) { unlock_page(newpage); @@ -939,7 +928,7 @@ out_fallback: cs->pg = buf->page; cs->offset = buf->offset; - err = lock_request(cs->fc, cs->req); + err = lock_request(cs->req); if (err) return err; @@ -950,11 +939,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, unsigned offset, unsigned count) { struct pipe_buffer *buf; + int err; if (cs->nr_segs == cs->pipe->buffers) return -EIO; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); buf = cs->pipebufs; @@ -1065,36 +1058,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, return err; } -static int forget_pending(struct fuse_conn *fc) +static int forget_pending(struct fuse_iqueue *fiq) { - return fc->forget_list_head.next != NULL; + return fiq->forget_list_head.next != NULL; } -static int request_pending(struct fuse_conn *fc) +static int request_pending(struct fuse_iqueue *fiq) { - return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || - forget_pending(fc); -} - -/* Wait until a request is available on the pending list */ -static void request_wait(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&fc->waitq, &wait); - while (fc->connected && !request_pending(fc)) { - set_current_state(TASK_INTERRUPTIBLE); - if (signal_pending(current)) - break; - - spin_unlock(&fc->lock); - schedule(); - spin_lock(&fc->lock); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(&fc->waitq, &wait); + return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) || + forget_pending(fiq); } /* @@ -1103,11 +1075,12 @@ __acquires(fc->lock) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fc->lock held, releases it + * Called with fiq->waitq.lock held, releases it */ -static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_interrupt(struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fc->lock) +__releases(fiq->waitq.lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1115,7 +1088,7 @@ __releases(fc->lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fc); + req->intr_unique = fuse_get_unique(fiq); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; @@ -1123,7 +1096,7 @@ __releases(fc->lock) ih.unique = req->intr_unique; arg.unique = req->in.h.unique; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); if (nbytes < reqsize) return -EINVAL; @@ -1135,21 +1108,21 @@ __releases(fc->lock) return err ? err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, +static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, unsigned max, unsigned *countp) { - struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; unsigned count; for (count = 0; *newhead != NULL && count < max; count++) newhead = &(*newhead)->next; - fc->forget_list_head.next = *newhead; + fiq->forget_list_head.next = *newhead; *newhead = NULL; - if (fc->forget_list_head.next == NULL) - fc->forget_list_tail = &fc->forget_list_head; + if (fiq->forget_list_head.next == NULL) + fiq->forget_list_tail = &fiq->forget_list_head; if (countp != NULL) *countp = count; @@ -1157,24 +1130,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, return head; } -static int fuse_read_single_forget(struct fuse_conn *fc, +static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1190,9 +1163,9 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_batch_forget(struct fuse_conn *fc, +static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; unsigned max_forgets; @@ -1201,18 +1174,18 @@ __releases(fc->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; if (nbytes < ih.len) { - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fc, max_forgets, &count); - spin_unlock(&fc->lock); + head = dequeue_forget(fiq, max_forgets, &count); + spin_unlock(&fiq->waitq.lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1239,14 +1212,15 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { - if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) - return fuse_read_single_forget(fc, cs, nbytes); + if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fiq, cs, nbytes); else - return fuse_read_batch_forget(fc, cs, nbytes); + return fuse_read_batch_forget(fiq, cs, nbytes); } /* @@ -1258,46 +1232,51 @@ __releases(fc->lock) * request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ -static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, +static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_copy_state *cs, size_t nbytes) { - int err; + ssize_t err; + struct fuse_conn *fc = fud->fc; + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_in *in; unsigned reqsize; restart: - spin_lock(&fc->lock); + spin_lock(&fiq->waitq.lock); err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fc->connected && - !request_pending(fc)) + if ((file->f_flags & O_NONBLOCK) && fiq->connected && + !request_pending(fiq)) goto err_unlock; - request_wait(fc); - err = -ENODEV; - if (!fc->connected) + err = wait_event_interruptible_exclusive_locked(fiq->waitq, + !fiq->connected || request_pending(fiq)); + if (err) goto err_unlock; - err = -ERESTARTSYS; - if (!request_pending(fc)) + + err = -ENODEV; + if (!fiq->connected) goto err_unlock; - if (!list_empty(&fc->interrupts)) { - req = list_entry(fc->interrupts.next, struct fuse_req, + if (!list_empty(&fiq->interrupts)) { + req = list_entry(fiq->interrupts.next, struct fuse_req, intr_entry); - return fuse_read_interrupt(fc, cs, nbytes, req); + return fuse_read_interrupt(fiq, cs, nbytes, req); } - if (forget_pending(fc)) { - if (list_empty(&fc->pending) || fc->forget_batch-- > 0) - return fuse_read_forget(fc, cs, nbytes); + if (forget_pending(fiq)) { + if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0) + return fuse_read_forget(fc, fiq, cs, nbytes); - if (fc->forget_batch <= -8) - fc->forget_batch = 16; + if (fiq->forget_batch <= -8) + fiq->forget_batch = 16; } - req = list_entry(fc->pending.next, struct fuse_req, list); - req->state = FUSE_REQ_READING; - list_move(&req->list, &fc->io); + req = list_entry(fiq->pending.next, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + spin_unlock(&fiq->waitq.lock); in = &req->in; reqsize = in->h.len; @@ -1310,37 +1289,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, request_end(fc, req); goto restart; } - spin_unlock(&fc->lock); + spin_lock(&fpq->lock); + list_add(&req->list, &fpq->io); + spin_unlock(&fpq->lock); cs->req = req; err = fuse_copy_one(cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) { - request_end(fc, req); - return -ENODEV; + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) { + err = -ENODEV; + goto out_end; } if (err) { req->out.h.error = -EIO; - request_end(fc, req); - return err; + goto out_end; } - if (!req->isreply) - request_end(fc, req); - else { - req->state = FUSE_REQ_SENT; - list_move_tail(&req->list, &fc->processing); - if (req->interrupted) - queue_interrupt(fc, req); - spin_unlock(&fc->lock); + if (!test_bit(FR_ISREPLY, &req->flags)) { + err = reqsize; + goto out_end; } + list_move_tail(&req->list, &fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + if (test_bit(FR_INTERRUPTED, &req->flags)) + queue_interrupt(fiq, req); + return reqsize; +out_end: + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); + return err; + err_unlock: - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } @@ -1359,16 +1349,17 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; if (!iter_is_iovec(to)) return -EINVAL; - fuse_copy_init(&cs, fc, 1, to); + fuse_copy_init(&cs, 1, to); - return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to)); + return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, @@ -1380,18 +1371,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, int do_wakeup = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(in); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(in); + + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, fc, 1, NULL); + fuse_copy_init(&cs, 1, NULL); cs.pipebufs = bufs; cs.pipe = pipe; - ret = fuse_dev_do_read(fc, in, &cs, len); + ret = fuse_dev_do_read(fud, in, &cs, len); if (ret < 0) goto out; @@ -1830,11 +1822,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { struct fuse_req *req; - list_for_each_entry(req, &fc->processing, list) { + list_for_each_entry(req, &fpq->processing, list) { if (req->in.h.unique == unique || req->intr_unique == unique) return req; } @@ -1871,10 +1863,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * it from the list and copy the rest of the buffer to the request. * The request is finished by calling request_end() */ -static ssize_t fuse_dev_do_write(struct fuse_conn *fc, +static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) { int err; + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_out_header oh; @@ -1902,63 +1896,60 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, if (oh.error <= -1000 || oh.error > 0) goto err_finish; - spin_lock(&fc->lock); + spin_lock(&fpq->lock); err = -ENOENT; - if (!fc->connected) - goto err_unlock; + if (!fpq->connected) + goto err_unlock_pq; - req = request_find(fc, oh.unique); + req = request_find(fpq, oh.unique); if (!req) - goto err_unlock; + goto err_unlock_pq; - if (req->aborted) { - spin_unlock(&fc->lock); - fuse_copy_finish(cs); - spin_lock(&fc->lock); - request_end(fc, req); - return -ENOENT; - } /* Is it an interrupt reply? */ if (req->intr_unique == oh.unique) { + spin_unlock(&fpq->lock); + err = -EINVAL; if (nbytes != sizeof(struct fuse_out_header)) - goto err_unlock; + goto err_finish; if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - queue_interrupt(fc, req); + queue_interrupt(&fc->iq, req); - spin_unlock(&fc->lock); fuse_copy_finish(cs); return nbytes; } - req->state = FUSE_REQ_WRITING; - list_move(&req->list, &fc->io); + clear_bit(FR_SENT, &req->flags); + list_move(&req->list, &fpq->io); req->out.h = oh; - req->locked = 1; + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&fpq->lock); cs->req = req; if (!req->out.page_replace) cs->move_pages = 0; - spin_unlock(&fc->lock); err = copy_out_args(cs, &req->out, nbytes); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (!err) { - if (req->aborted) - err = -ENOENT; - } else if (!req->aborted) + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) + err = -ENOENT; + else if (err) req->out.h.error = -EIO; + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); return err ? err : nbytes; - err_unlock: - spin_unlock(&fc->lock); + err_unlock_pq: + spin_unlock(&fpq->lock); err_finish: fuse_copy_finish(cs); return err; @@ -1967,16 +1958,17 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + + if (!fud) return -EPERM; if (!iter_is_iovec(from)) return -EINVAL; - fuse_copy_init(&cs, fc, 0, from); + fuse_copy_init(&cs, 0, from); - return fuse_dev_do_write(fc, &cs, iov_iter_count(from)); + return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, @@ -1987,12 +1979,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc; + struct fuse_dev *fud; size_t rem; ssize_t ret; - fc = fuse_get_conn(out); - if (!fc) + fud = fuse_get_dev(out); + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2039,7 +2031,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, fc, 0, NULL); + fuse_copy_init(&cs, 0, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; @@ -2047,7 +2039,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, if (flags & SPLICE_F_MOVE) cs.move_pages = 1; - ret = fuse_dev_do_write(fc, &cs, len); + ret = fuse_dev_do_write(fud, &cs, len); for (idx = 0; idx < nbuf; idx++) { struct pipe_buffer *buf = &bufs[idx]; @@ -2061,18 +2053,21 @@ out: static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { unsigned mask = POLLOUT | POLLWRNORM; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_iqueue *fiq; + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return POLLERR; - poll_wait(file, &fc->waitq, wait); + fiq = &fud->fc->iq; + poll_wait(file, &fiq->waitq, wait); - spin_lock(&fc->lock); - if (!fc->connected) + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) mask = POLLERR; - else if (request_pending(fc)) + else if (request_pending(fiq)) mask |= POLLIN | POLLRDNORM; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return mask; } @@ -2083,67 +2078,18 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(fc->lock) -__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - request_end(fc, req); - spin_lock(&fc->lock); - } -} - -/* - * Abort requests under I/O - * - * The requests are set to aborted and finished, and the request - * waiter is woken up. This will make request_wait_answer() wait - * until the request is unlocked and then return. - * - * If the request is asynchronous, then the end function needs to be - * called after waiting for the request to be unlocked (if it was - * locked). - */ -static void end_io_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - while (!list_empty(&fc->io)) { - struct fuse_req *req = - list_entry(fc->io.next, struct fuse_req, list); - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - - req->aborted = 1; - req->out.h.error = -ECONNABORTED; - req->state = FUSE_REQ_FINISHED; + clear_bit(FR_PENDING, &req->flags); + clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - wake_up(&req->waitq); - if (end) { - req->end = NULL; - __fuse_get_request(req); - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - end(fc, req); - fuse_put_request(fc, req); - spin_lock(&fc->lock); - } + request_end(fc, req); } } -static void end_queued_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - fc->max_background = UINT_MAX; - flush_bg_queue(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); - while (forget_pending(fc)) - kfree(dequeue_forget(fc, 1, NULL)); -} - static void end_polls(struct fuse_conn *fc) { struct rb_node *p; @@ -2162,67 +2108,156 @@ static void end_polls(struct fuse_conn *fc) /* * Abort all requests. * - * Emergency exit in case of a malicious or accidental deadlock, or - * just a hung filesystem. + * Emergency exit in case of a malicious or accidental deadlock, or just a hung + * filesystem. * - * The same effect is usually achievable through killing the - * filesystem daemon and all users of the filesystem. The exception - * is the combination of an asynchronous request and the tricky - * deadlock (see Documentation/filesystems/fuse.txt). + * The same effect is usually achievable through killing the filesystem daemon + * and all users of the filesystem. The exception is the combination of an + * asynchronous request and the tricky deadlock (see + * Documentation/filesystems/fuse.txt). * - * During the aborting, progression of requests from the pending and - * processing lists onto the io list, and progression of new requests - * onto the pending list is prevented by req->connected being false. - * - * Progression of requests under I/O to the processing list is - * prevented by the req->aborted flag being true for these requests. - * For this reason requests on the io list must be aborted first. + * Aborting requests under I/O goes as follows: 1: Separate out unlocked + * requests, they should be finished off immediately. Locked requests will be + * finished after unlock; see unlock_request(). 2: Finish off the unlocked + * requests. It is possible that some request will finish before we can. This + * is OK, the request will in that case be removed from the list before we touch + * it. */ void fuse_abort_conn(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + spin_lock(&fc->lock); if (fc->connected) { + struct fuse_dev *fud; + struct fuse_req *req, *next; + LIST_HEAD(to_end1); + LIST_HEAD(to_end2); + fc->connected = 0; fc->blocked = 0; fuse_set_initialized(fc); - end_io_requests(fc); - end_queued_requests(fc); + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + fpq->connected = 0; + list_for_each_entry_safe(req, next, &fpq->io, list) { + req->out.h.error = -ECONNABORTED; + spin_lock(&req->waitq.lock); + set_bit(FR_ABORTED, &req->flags); + if (!test_bit(FR_LOCKED, &req->flags)) { + set_bit(FR_PRIVATE, &req->flags); + list_move(&req->list, &to_end1); + } + spin_unlock(&req->waitq.lock); + } + list_splice_init(&fpq->processing, &to_end2); + spin_unlock(&fpq->lock); + } + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + + spin_lock(&fiq->waitq.lock); + fiq->connected = 0; + list_splice_init(&fiq->pending, &to_end2); + while (forget_pending(fiq)) + kfree(dequeue_forget(fiq, 1, NULL)); + wake_up_all_locked(&fiq->waitq); + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); - wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_unlock(&fc->lock); + + while (!list_empty(&to_end1)) { + req = list_first_entry(&to_end1, struct fuse_req, list); + __fuse_get_request(req); + list_del_init(&req->list); + request_end(fc, req); + } + end_requests(fc, &to_end2); + } else { + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); } EXPORT_SYMBOL_GPL(fuse_abort_conn); int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc = fuse_get_conn(file); - if (fc) { - spin_lock(&fc->lock); - fc->connected = 0; - fc->blocked = 0; - fuse_set_initialized(fc); - end_queued_requests(fc); - end_polls(fc); - wake_up_all(&fc->blocked_waitq); - spin_unlock(&fc->lock); - fuse_conn_put(fc); - } + struct fuse_dev *fud = fuse_get_dev(file); + if (fud) { + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; + + WARN_ON(!list_empty(&fpq->io)); + end_requests(fc, &fpq->processing); + /* Are we the last open device? */ + if (atomic_dec_and_test(&fc->dev_count)) { + WARN_ON(fc->iq.fasync != NULL); + fuse_abort_conn(fc); + } + fuse_dev_free(fud); + } return 0; } EXPORT_SYMBOL_GPL(fuse_dev_release); static int fuse_dev_fasync(int fd, struct file *file, int on) { - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; /* No locking - fasync_helper does its own locking */ - return fasync_helper(fd, file, on, &fc->fasync); + return fasync_helper(fd, file, on, &fud->fc->iq.fasync); +} + +static int fuse_device_clone(struct fuse_conn *fc, struct file *new) +{ + struct fuse_dev *fud; + + if (new->private_data) + return -EINVAL; + + fud = fuse_dev_alloc(fc); + if (!fud) + return -ENOMEM; + + new->private_data = fud; + atomic_inc(&fc->dev_count); + + return 0; +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err = -ENOTTY; + + if (cmd == FUSE_DEV_IOC_CLONE) { + int oldfd; + + err = -EFAULT; + if (!get_user(oldfd, (__u32 __user *) arg)) { + struct file *old = fget(oldfd); + + err = -EINVAL; + if (old) { + struct fuse_dev *fud = fuse_get_dev(old); + + if (fud) { + mutex_lock(&fuse_mutex); + err = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + fput(old); + } + } + } + return err; } const struct file_operations fuse_dev_operations = { @@ -2236,6 +2271,8 @@ const struct file_operations fuse_dev_operations = { .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, + .unlocked_ioctl = fuse_dev_ioctl, + .compat_ioctl = fuse_dev_ioctl, }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8c5e2fa68835..f523f2f04c19 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -96,17 +96,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) * Drop the release request when client does not * implement 'open' */ - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); fuse_request_send_background(ff->fc, req); } kfree(ff); @@ -299,8 +299,8 @@ void fuse_sync_release(struct fuse_file *ff, int flags) { WARN_ON(atomic_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); - ff->reserved_req->force = 1; - ff->reserved_req->background = 0; + __set_bit(FR_FORCE, &ff->reserved_req->flags); + __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags); fuse_request_send(ff->fc, ff->reserved_req); fuse_put_request(ff->fc, ff->reserved_req); kfree(ff); @@ -426,7 +426,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->force = 1; + __set_bit(FR_FORCE, &req->flags); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -1169,7 +1169,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err <= 0) goto out; - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -1611,7 +1611,8 @@ static int fuse_writepage_locked(struct page *page) if (!req) goto err; - req->background = 1; /* writeback always goes to bg_queue */ + /* writeback always goes to bg_queue */ + __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; @@ -1742,8 +1743,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, } } - if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || - old_req->state == FUSE_REQ_PENDING)) { + if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); @@ -1830,7 +1830,7 @@ static int fuse_writepages_fill(struct page *page, req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; req->misc.write.next = NULL; req->in.argpages = 1; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); req->num_pages = 0; req->end = fuse_writepage_end; req->inode = inode; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7354dc142a50..405113101db8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -241,16 +241,6 @@ struct fuse_args { #define FUSE_ARGS(args) struct fuse_args args = {} -/** The request state */ -enum fuse_req_state { - FUSE_REQ_INIT = 0, - FUSE_REQ_PENDING, - FUSE_REQ_READING, - FUSE_REQ_SENT, - FUSE_REQ_WRITING, - FUSE_REQ_FINISHED -}; - /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { int async; @@ -267,7 +257,40 @@ struct fuse_io_priv { }; /** + * Request flags + * + * FR_ISREPLY: set if the request has reply + * FR_FORCE: force sending of the request even if interrupted + * FR_BACKGROUND: request is sent in the background + * FR_WAITING: request is counted as "waiting" + * FR_ABORTED: the request was aborted + * FR_INTERRUPTED: the request has been interrupted + * FR_LOCKED: data is being copied to/from the request + * FR_PENDING: request is not yet in userspace + * FR_SENT: request is in userspace, waiting for an answer + * FR_FINISHED: request is finished + * FR_PRIVATE: request is on private list + */ +enum fuse_req_flag { + FR_ISREPLY, + FR_FORCE, + FR_BACKGROUND, + FR_WAITING, + FR_ABORTED, + FR_INTERRUPTED, + FR_LOCKED, + FR_PENDING, + FR_SENT, + FR_FINISHED, + FR_PRIVATE, +}; + +/** * A request to the client + * + * .waitq.lock protects the following fields: + * - FR_ABORTED + * - FR_LOCKED (may also be modified under fc->lock, tested under both) */ struct fuse_req { /** This can be on either pending processing or io lists in @@ -283,35 +306,8 @@ struct fuse_req { /** Unique ID for the interrupt request */ u64 intr_unique; - /* - * The following bitfields are either set once before the - * request is queued or setting/clearing them is protected by - * fuse_conn->lock - */ - - /** True if the request has reply */ - unsigned isreply:1; - - /** Force sending of the request even if interrupted */ - unsigned force:1; - - /** The request was aborted */ - unsigned aborted:1; - - /** Request is sent in the background */ - unsigned background:1; - - /** The request has been interrupted */ - unsigned interrupted:1; - - /** Data is being copied to/from the request */ - unsigned locked:1; - - /** Request is counted as "waiting" */ - unsigned waiting:1; - - /** State of the request */ - enum fuse_req_state state; + /* Request flags, updated with test/set/clear_bit() */ + unsigned long flags; /** The request input */ struct fuse_in in; @@ -380,6 +376,61 @@ struct fuse_req { struct file *stolen_file; }; +struct fuse_iqueue { + /** Connection established */ + unsigned connected; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The next unique request id */ + u64 reqctr; + + /** The list of pending requests */ + struct list_head pending; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; +}; + +struct fuse_pqueue { + /** Connection established */ + unsigned connected; + + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** The list of requests being processed */ + struct list_head processing; + + /** The list of requests under I/O */ + struct list_head io; +}; + +/** + * Fuse device instance + */ +struct fuse_dev { + /** Fuse connection for this device */ + struct fuse_conn *fc; + + /** Processing queue */ + struct fuse_pqueue pq; + + /** list entry on fc->devices */ + struct list_head entry; +}; + /** * A Fuse connection. * @@ -394,6 +445,9 @@ struct fuse_conn { /** Refcount */ atomic_t count; + /** Number of fuse_dev's */ + atomic_t dev_count; + struct rcu_head rcu; /** The user id for this mount */ @@ -411,17 +465,8 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Readers of the connection are waiting on this */ - wait_queue_head_t waitq; - - /** The list of pending requests */ - struct list_head pending; - - /** The list of requests being processed */ - struct list_head processing; - - /** The list of requests under I/O */ - struct list_head io; + /** Input queue */ + struct fuse_iqueue iq; /** The next unique kernel file handle */ u64 khctr; @@ -444,16 +489,6 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; - /** Pending interrupts */ - struct list_head interrupts; - - /** Queue of pending forgets */ - struct fuse_forget_link forget_list_head; - struct fuse_forget_link *forget_list_tail; - - /** Batching of FORGET requests (positive indicates FORGET batch) */ - int forget_batch; - /** Flag indicating that INIT reply has been received. Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -469,9 +504,6 @@ struct fuse_conn { /** waitq for reserved requests */ wait_queue_head_t reserved_req_waitq; - /** The next unique request id */ - u64 reqctr; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -594,9 +626,6 @@ struct fuse_conn { /** number of dentries used in the above array */ int ctl_ndents; - /** O_ASYNC requests */ - struct fasync_struct *fasync; - /** Key for lock owner ID scrambling */ u32 scramble_key[4]; @@ -614,6 +643,9 @@ struct fuse_conn { /** Read/write semaphore to hold when accessing sb. */ struct rw_semaphore killsb; + + /** List of device instances belonging to this connection */ + struct list_head devices; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -826,6 +858,9 @@ void fuse_conn_init(struct fuse_conn *fc); */ void fuse_conn_put(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +void fuse_dev_free(struct fuse_dev *fud); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 082ac1c97f39..2913db2a5b99 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -362,8 +362,8 @@ static void fuse_send_destroy(struct fuse_conn *fc) if (req && fc->conn_init) { fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; - req->force = 1; - req->background = 0; + __set_bit(FR_FORCE, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(fc, req); fuse_put_request(fc, req); } @@ -567,30 +567,46 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void fuse_iqueue_init(struct fuse_iqueue *fiq) +{ + memset(fiq, 0, sizeof(struct fuse_iqueue)); + init_waitqueue_head(&fiq->waitq); + INIT_LIST_HEAD(&fiq->pending); + INIT_LIST_HEAD(&fiq->interrupts); + fiq->forget_list_tail = &fiq->forget_list_head; + fiq->connected = 1; +} + +static void fuse_pqueue_init(struct fuse_pqueue *fpq) +{ + memset(fpq, 0, sizeof(struct fuse_pqueue)); + spin_lock_init(&fpq->lock); + INIT_LIST_HEAD(&fpq->processing); + INIT_LIST_HEAD(&fpq->io); + fpq->connected = 1; +} + void fuse_conn_init(struct fuse_conn *fc) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); init_rwsem(&fc->killsb); atomic_set(&fc->count, 1); - init_waitqueue_head(&fc->waitq); + atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); - INIT_LIST_HEAD(&fc->pending); - INIT_LIST_HEAD(&fc->processing); - INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->interrupts); + fuse_iqueue_init(&fc->iq); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); - fc->forget_list_tail = &fc->forget_list_head; + INIT_LIST_HEAD(&fc->devices); atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; fc->khctr = 0; fc->polled_files = RB_ROOT; - fc->reqctr = 0; fc->blocked = 0; fc->initialized = 0; + fc->connected = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } @@ -930,6 +946,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) static void fuse_free_conn(struct fuse_conn *fc) { + WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } @@ -975,8 +992,42 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } +struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); + if (fud) { + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + } + + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc); + +void fuse_dev_free(struct fuse_dev *fud) +{ + struct fuse_conn *fc = fud->fc; + + if (fc) { + spin_lock(&fc->lock); + list_del(&fud->entry); + spin_unlock(&fc->lock); + + fuse_conn_put(fc); + } + kfree(fud); +} +EXPORT_SYMBOL_GPL(fuse_dev_free); + static int fuse_fill_super(struct super_block *sb, void *data, int silent) { + struct fuse_dev *fud; struct fuse_conn *fc; struct inode *root; struct fuse_mount_data d; @@ -1026,12 +1077,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_fput; fuse_conn_init(fc); + fc->release = fuse_free_conn; + + fud = fuse_dev_alloc(fc); + if (!fud) + goto err_put_conn; fc->dev = sb->s_dev; fc->sb = sb; err = fuse_bdi_init(fc, sb); if (err) - goto err_put_conn; + goto err_dev_free; sb->s_bdi = &fc->bdi; @@ -1040,7 +1096,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= MS_POSIXACL; - fc->release = fuse_free_conn; fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; @@ -1053,14 +1108,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) root = fuse_get_root_inode(sb, d.rootmode); root_dentry = d_make_root(root); if (!root_dentry) - goto err_put_conn; + goto err_dev_free; /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; - init_req->background = 1; + __set_bit(FR_BACKGROUND, &init_req->flags); if (is_bdev) { fc->destroy_req = fuse_request_alloc(0); @@ -1079,8 +1134,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - fc->connected = 1; - file->private_data = fuse_conn_get(fc); + file->private_data = fud; mutex_unlock(&fuse_mutex); /* * atomic_dec_and_test() in fput() provides the necessary @@ -1099,6 +1153,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_request_free(init_req); err_put_root: dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); err_put_conn: fuse_bdi_destroy(fc); fuse_conn_put(fc); @@ -1238,7 +1294,6 @@ static void fuse_fs_cleanup(void) } static struct kobject *fuse_kobj; -static struct kobject *connections_kobj; static int fuse_sysfs_init(void) { @@ -1250,11 +1305,9 @@ static int fuse_sysfs_init(void) goto out_err; } - connections_kobj = kobject_create_and_add("connections", fuse_kobj); - if (!connections_kobj) { - err = -ENOMEM; + err = sysfs_create_mount_point(fuse_kobj, "connections"); + if (err) goto out_fuse_unregister; - } return 0; @@ -1266,7 +1319,7 @@ static int fuse_sysfs_init(void) static void fuse_sysfs_cleanup(void) { - kobject_put(connections_kobj); + sysfs_remove_mount_point(fuse_kobj, "connections"); kobject_put(fuse_kobj); } diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 95d255219b1e..1f1c7dcbcc2f 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -252,7 +252,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb); #define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) #define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) -#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b0441d65fa54..f91a1faf819e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -263,7 +263,7 @@ struct hfsplus_inode_info { static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { - return list_entry(inode, struct hfsplus_inode_info, vfs_inode); + return container_of(inode, struct hfsplus_inode_info, vfs_inode); } /* diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b63b75fa00e7..bb04b58d1d69 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -304,7 +304,7 @@ extern const struct address_space_operations hpfs_symlink_aops; static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) { - return list_entry(inode, struct hpfs_inode_info, vfs_inode); + return container_of(inode, struct hpfs_inode_info, vfs_inode); } static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) diff --git a/fs/inode.c b/fs/inode.c index 069721f0cc0e..d30640f7a193 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -841,7 +841,11 @@ unsigned int get_next_ino(void) } #endif - *p = ++res; + res++; + /* get_next_ino should not provide a 0 inode number */ + if (unlikely(!res)) + res++; + *p = res; put_cpu_var(last_ino); return res; } @@ -1674,7 +1678,31 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -static int __remove_suid(struct dentry *dentry, int kill) +/* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. + * Negative value on error (change should be denied). + */ +int dentry_needs_remove_privs(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int mask = 0; + int ret; + + if (IS_NOSEC(inode)) + return 0; + + mask = should_remove_suid(dentry); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; + if (ret) + mask |= ATTR_KILL_PRIV; + return mask; +} +EXPORT_SYMBOL(dentry_needs_remove_privs); + +static int __remove_privs(struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1686,33 +1714,32 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs, NULL); } -int file_remove_suid(struct file *file) +/* + * Remove special file priviledges (suid, capabilities) when file is written + * to or truncated. + */ +int file_remove_privs(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); - int killsuid; - int killpriv; + int kill; int error = 0; /* Fast path for nothing security related */ if (IS_NOSEC(inode)) return 0; - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); - if (!error && (inode->i_sb->s_flags & MS_NOSEC)) - inode->i_flags |= S_NOSEC; + kill = file_needs_remove_privs(file); + if (kill < 0) + return kill; + if (kill) + error = __remove_privs(dentry, kill); + if (!error) + inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL(file_remove_suid); +EXPORT_SYMBOL(file_remove_privs); /** * file_update_time - update mtime and ctime time @@ -1967,9 +1994,8 @@ EXPORT_SYMBOL(inode_dio_wait); * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one - * code path which doesn't today --- for example, - * __generic_file_aio_write() calls file_remove_suid() without holding - * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * code path which doesn't today so we use cmpxchg() out of an abundance + * of caution. * * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure diff --git a/fs/internal.h b/fs/internal.h index 01dce1d1476b..4d5af583ab03 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); +extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index d200a9b8fd5e..824e61ede465 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -19,7 +19,7 @@ struct kstatfs; struct kvec; -#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define JFFS2_INODE_INFO(i) (container_of(i, struct jffs2_inode_info, vfs_inode)) #define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) #define JFFS2_SB_INFO(sb) (sb->s_fs_info) #define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index fa7e795bd8ae..1f26d1910409 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -206,7 +206,7 @@ struct jfs_sb_info { static inline struct jfs_inode_info *JFS_IP(struct inode *inode) { - return list_entry(inode, struct jfs_inode_info, vfs_inode); + return container_of(inode, struct jfs_inode_info, vfs_inode); } static inline int jfs_dirtable_inline(struct inode *inode) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index fffca9517321..2d48d28e1640 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -592,6 +592,9 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; ret = -ENOENT; + if (parent->flags & KERNFS_EMPTY_DIR) + goto out_unlock; + if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) goto out_unlock; @@ -783,6 +786,38 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, return ERR_PTR(rc); } +/** + * kernfs_create_empty_dir - create an always empty directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, + const char *name) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->flags |= KERNFS_EMPTY_DIR; + kn->dir.root = parent->dir.root; + kn->ns = NULL; + kn->priv = NULL; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -1254,7 +1289,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&kernfs_mutex); error = -ENOENT; - if (!kernfs_active(kn) || !kernfs_active(new_parent)) + if (!kernfs_active(kn) || !kernfs_active(new_parent) || + (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; error = 0; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 2da8493a380b..756dd56aaf60 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -296,6 +296,8 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; + if (kn->flags & KERNFS_EMPTY_DIR) + make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; diff --git a/fs/libfs.c b/fs/libfs.c index 65e1feca8b98..102edfd39000 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,11 +20,6 @@ #include "internal.h" -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -1108,3 +1103,98 @@ const struct inode_operations simple_symlink_inode_operations = { .readlink = generic_readlink }; EXPORT_SYMBOL(simple_symlink_inode_operations); + +/* + * Operations for a permanently empty directory. + */ +static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return ERR_PTR(-ENOENT); +} + +static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + generic_fillattr(inode, stat); + return 0; +} + +static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) +{ + return -EPERM; +} + +static int empty_dir_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int empty_dir_removexattr(struct dentry *dentry, const char *name) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) +{ + return -EOPNOTSUPP; +} + +static const struct inode_operations empty_dir_inode_operations = { + .lookup = empty_dir_lookup, + .permission = generic_permission, + .setattr = empty_dir_setattr, + .getattr = empty_dir_getattr, + .setxattr = empty_dir_setxattr, + .getxattr = empty_dir_getxattr, + .removexattr = empty_dir_removexattr, + .listxattr = empty_dir_listxattr, +}; + +static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) +{ + /* An empty directory has two entries . and .. at offsets 0 and 1 */ + return generic_file_llseek_size(file, offset, whence, 2, 2); +} + +static int empty_dir_readdir(struct file *file, struct dir_context *ctx) +{ + dir_emit_dots(file, ctx); + return 0; +} + +static const struct file_operations empty_dir_operations = { + .llseek = empty_dir_llseek, + .read = generic_read_dir, + .iterate = empty_dir_readdir, + .fsync = noop_fsync, +}; + + +void make_empty_dir_inode(struct inode *inode) +{ + set_nlink(inode, 2); + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_rdev = 0; + inode->i_size = 2; + inode->i_blkbits = PAGE_SHIFT; + inode->i_blocks = 0; + + inode->i_op = &empty_dir_inode_operations; + inode->i_fop = &empty_dir_operations; +} + +bool is_empty_dir_inode(struct inode *inode) +{ + return (inode->i_fop == &empty_dir_operations) && + (inode->i_op == &empty_dir_inode_operations); +} diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 118e4e7bc935..d19ac258105a 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 1ebd11854622..01ad81dcacc5 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -84,7 +84,7 @@ static inline struct minix_sb_info *minix_sb(struct super_block *sb) static inline struct minix_inode_info *minix_i(struct inode *inode) { - return list_entry(inode, struct minix_inode_info, vfs_inode); + return container_of(inode, struct minix_inode_info, vfs_inode); } static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) diff --git a/fs/mount.h b/fs/mount.h index b5b8082bfa42..14db05d424f7 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -118,7 +118,6 @@ static inline void unlock_mount_hash(void) } struct proc_mounts { - struct seq_file m; struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); @@ -127,8 +126,6 @@ struct proc_mounts { loff_t cached_index; }; -#define proc_mounts(p) (container_of((p), struct proc_mounts, m)) - extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(struct dentry *dentry); diff --git a/fs/namei.c b/fs/namei.c index 2dad0eaf91d3..ae4e4c18b2ac 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -792,7 +792,7 @@ static void set_root(struct nameidata *nd) get_fs_root(current->fs, &nd->root); } -static unsigned set_root_rcu(struct nameidata *nd) +static void set_root_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; unsigned seq; @@ -802,7 +802,6 @@ static unsigned set_root_rcu(struct nameidata *nd) nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); - return nd->root_seq; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -1998,7 +1997,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (*s == '/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - nd->seq = set_root_rcu(nd); + set_root_rcu(nd); + nd->seq = nd->root_seq; } else { set_root(nd); path_get(&nd->root); diff --git a/fs/namespace.c b/fs/namespace.c index 9c1c43d0d4f1..c7cb8a526c05 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1226,7 +1226,7 @@ EXPORT_SYMBOL(replace_mount_options); /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; down_read(&namespace_sem); if (p->cached_event == p->ns->event) { @@ -1247,7 +1247,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; p->cached_mount = seq_list_next(v, &p->ns->list, pos); p->cached_index = *pos; @@ -1261,7 +1261,7 @@ static void m_stop(struct seq_file *m, void *v) static int m_show(struct seq_file *m, void *v) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = list_entry(v, struct mount, mnt_list); return p->show(m, &r->mnt); } @@ -2343,6 +2343,8 @@ unlock: return err; } +static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags); + /* * create a new mount for userspace and request it to be added into the * namespace's tree @@ -2374,6 +2376,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, flags |= MS_NODEV; mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } + if (type->fs_flags & FS_USERNS_VISIBLE) { + if (!fs_fully_visible(type, &mnt_flags)) + return -EPERM; + } } mnt = vfs_kern_mount(type, flags, name, data); @@ -3175,9 +3181,10 @@ bool current_chrooted(void) return chrooted; } -bool fs_fully_visible(struct file_system_type *type) +static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; + int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; @@ -3196,16 +3203,36 @@ bool fs_fully_visible(struct file_system_type *type) if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; - /* This mount is not fully visible if there are any child mounts - * that cover anything except for empty directories. + /* Verify the mount flags are equal to or more permissive + * than the proposed new mount. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(new_flags & MNT_READONLY)) + continue; + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(new_flags & MNT_NODEV)) + continue; + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) + continue; + + /* This mount is not fully visible if there are any + * locked child mounts that cover anything except for + * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; - if (!S_ISDIR(inode->i_mode)) - goto next; - if (inode->i_nlink > 2) + /* Only worry about locked mounts */ + if (!(mnt->mnt.mnt_flags & MNT_LOCKED)) + continue; + /* Is the directory permanetly empty? */ + if (!is_empty_dir_inode(inode)) goto next; } + /* Preserve the locked attributes */ + *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \ + MNT_LOCK_NODEV | \ + MNT_LOCK_ATIME); visible = true; goto found; next: ; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 80021c709af9..93575e91a7aa 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1145,6 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, case 0x00: ncp_dbg(1, "renamed %pd -> %pd\n", old_dentry, new_dentry); + ncp_d_prune(old_dentry); + ncp_d_prune(new_dentry); break; case 0x9E: error = -ENAMETOOLONG; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 8d129bb7355a..682529c00996 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * pg_authenticate method for nfsv4 callback threads. * * The authflavor has been negotiated, so an incorrect flavor is a server - * bug. Drop packets with incorrect authflavor. + * bug. Deny packets with incorrect authflavor. * * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound @@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) - return SVC_DROP; + return SVC_DENIED; break; case RPC_AUTH_GSS: /* No RPC_AUTH_GSS support yet in NFSv4.1 */ if (svc_is_backchannel(rqstp)) - return SVC_DROP; + return SVC_DENIED; } return SVC_OK; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 197806fb87ff..29e3c1b011b7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr); /* Normal */ - if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { - slot->seq_nr++; + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) goto out_ok; - } /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { @@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_process_state *cps) { struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) goto out; + tbl = &clp->cl_session->bc_slot_table; + slot = tbl->slots + args->csa_slotid; spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { - spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session * in order to reset it. */ if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) status = htonl(NFS4ERR_BADSESSION); - goto out; + goto out_unlock; } - status = validate_seqid(&clp->cl_session->bc_slot_table, args); - spin_unlock(&tbl->slot_tbl_lock); + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + + status = validate_seqid(tbl, args); if (status) - goto out; + goto out_unlock; cps->slotid = args->csa_slotid; @@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, */ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { status = htonl(NFS4ERR_DELAY); - goto out; + goto out_unlock; } - memcpy(&res->csr_sessionid, &args->csa_sessionid, - sizeof(res->csr_sessionid)); - res->csr_sequenceid = args->csa_sequenceid; - res->csr_slotid = args->csa_slotid; - res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + /* + * RFC5661 20.9.3 + * If CB_SEQUENCE returns an error, then the state of the slot + * (sequence ID, cached reply) MUST NOT change. + */ + slot->seq_nr++; +out_unlock: + spin_unlock(&tbl->slot_tbl_lock); out: cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 19ca95cdfd9b..6b1697a01dde 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r xdr_init_encode(&xdr_out, &rqstp->rq_res, p); status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); - if (status == __constant_htonl(NFS4ERR_RESOURCE)) + if (status == htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; if (hdr_arg.minorversion == 0) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 892aefff3630..ecebb406cc1a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -825,7 +825,6 @@ error: * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, - struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -901,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, mntfh, &fsinfo); + nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -1193,8 +1192,6 @@ void nfs_clients_init(struct net *net) } #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_fs_nfs; - static int nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); @@ -1364,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; - char dev[8], fsid[17]; + char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' + char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { - seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); + seq_puts(m, "NV SERVER PORT DEV FSID" + " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; - snprintf(dev, 8, "%u:%u", + snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); - snprintf(fsid, 17, "%llx:%llx", + snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); - seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), @@ -1434,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net) */ int __init nfs_fs_proc_init(void) { - struct proc_dir_entry *p; - - proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); - if (!proc_fs_nfs) + if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ - p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); - if (!p) + if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ - p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); - if (!p) - goto error_2; - return 0; + if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) + goto error_1; -error_2: - remove_proc_entry("servers", proc_fs_nfs); + return 0; error_1: - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } @@ -1464,9 +1456,7 @@ error_0: */ void nfs_fs_proc_exit(void) { - remove_proc_entry("volumes", proc_fs_nfs); - remove_proc_entry("servers", proc_fs_nfs); - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b2c8b31b2be7..547308a5ec6f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1470,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx, { int err; - if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) - *opened |= FILE_CREATED; - err = finish_open(file, dentry, do_open, opened); if (err) goto out; @@ -1771,7 +1768,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { - if (d_really_is_positive(dentry) && !d_unhashed(dentry)) + if (simple_positive(dentry)) d_delete(dentry); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8b8d83a526ce..cc4fa1ed61fc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page) return nfs_wb_page(inode, page); } -#ifdef CONFIG_NFS_SWAP static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { - int ret; struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); *span = sis->pages; - rcu_read_lock(); - ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); - rcu_read_unlock(); - - return ret; + return rpc_clnt_swap_activate(clnt); } static void nfs_swap_deactivate(struct file *file) { struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); - rcu_read_lock(); - xs_swapper(rcu_dereference(clnt->cl_xprt), 0); - rcu_read_unlock(); + rpc_clnt_swap_deactivate(clnt); } -#endif const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, @@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, -#ifdef CONFIG_NFS_SWAP .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, -#endif }; /* diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7d05089e52d6..c12951b9551e 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -20,6 +20,7 @@ #include "../nfs4trace.h" #include "../iostat.h" #include "../nfs.h" +#include "../nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { - struct nfs4_ff_layout_mirror *tmp; int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) { - tmp = fls->mirror_array[i]; - fls->mirror_array[i] = fls->mirror_array[j]; - fls->mirror_array[j] = tmp; - } + fls->mirror_array[j]->efficiency) + swap(fls->mirror_array[i], + fls->mirror_array[j]); } } @@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, spin_lock_init(&fls->mirror_array[i]->lock); fls->mirror_array[i]->ds_count = ds_count; + fls->mirror_array[i]->lseg = &fls->generic_hdr; /* deviceid */ rc = decode_deviceid(&stream, &devid); @@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->gid); } + p = xdr_inline_decode(&stream, 4); + if (p) + fls->flags = be32_to_cpup(p); + ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) @@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) return 1; } +static void +nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer) +{ + /* first IO request? */ + if (atomic_inc_return(&timer->n_ops) == 1) { + timer->start_time = ktime_get(); + } +} + +static ktime_t +nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer) +{ + ktime_t start, now; + + if (atomic_dec_return(&timer->n_ops) < 0) + WARN_ON_ONCE(1); + + now = ktime_get(); + start = timer->start_time; + timer->start_time = now; + return ktime_sub(now, start); +} + +static ktime_t +nfs4_ff_layout_calc_completion_time(struct rpc_task *task) +{ + return ktime_sub(ktime_get(), task->tk_start); +} + +static bool +nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + struct nfs4_ff_layoutstat *layoutstat) +{ + static const ktime_t notime = {0}; + ktime_t now = ktime_get(); + + nfs4_ff_start_busy_timer(&layoutstat->busy_timer); + if (ktime_equal(mirror->start_time, notime)) + mirror->start_time = now; + if (ktime_equal(mirror->last_report_time, notime)) + mirror->last_report_time = now; + if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + FF_LAYOUTSTATS_REPORT_INTERVAL) { + mirror->last_report_time = now; + return true; + } + + return false; +} + +static void +nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + + iostat->ops_requested++; + iostat->bytes_requested += requested; +} + +static void +nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested, + __u64 completed, + ktime_t time_completed) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + ktime_t timer; + + iostat->ops_completed++; + iostat->bytes_completed += completed; + iostat->bytes_not_delivered += requested - completed; + + timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer); + iostat->total_busy_time = + ktime_add(iostat->total_busy_time, timer); + iostat->aggregate_completion_time = + ktime_add(iostat->aggregate_completion_time, time_completed); +} + +static void +nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror, + __u64 requested) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat); + nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); +} + +static void +nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed) +{ + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + requested, completed, + nfs4_ff_layout_calc_completion_time(task)); + spin_unlock(&mirror->lock); +} + +static void +nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror, + __u64 requested) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat); + nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); +} + +static void +nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed, + enum nfs3_stable_how committed) +{ + if (committed == NFS_UNSTABLE) + requested = completed = 0; + + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + requested, completed, + nfs4_ff_layout_calc_completion_time(task)); + spin_unlock(&mirror->lock); +} + static int ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, @@ -631,7 +774,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) nfs_direct_set_resched_writes(hdr->dreq); /* fake unstable write to let common nfs resend pages */ hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = 0; + hdr->good_bytes = hdr->args.count; } return; } @@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task, return 0; } +static bool +ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) +{ + return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT); +} + /* * We reference the rpc_cred of the first WRITE that triggers the need for * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. @@ -891,6 +1040,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { + if (!ff_layout_need_layoutcommit(hdr->lseg)) + return; + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, @@ -909,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) static int ff_layout_read_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_read( + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -962,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_read_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) return; + if (ff_layout_read_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_READ) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -982,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1074,7 +1234,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) + if (data->verf.committed == NFS_UNSTABLE + && ff_layout_need_layoutcommit(data->lseg)) pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; @@ -1083,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, static int ff_layout_write_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_write( + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1116,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_write_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) return; + if (ff_layout_write_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_WRITE) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -1134,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1152,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + nfs4_ff_layout_stat_io_start_write( + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + 0); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { + ff_layout_commit_prepare_common(task, data); rpc_call_start(task); } @@ -1161,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - ff_layout_setup_sequence(wdata->ds_clp, + if (ff_layout_setup_sequence(wdata->ds_clp, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + ff_layout_commit_prepare_common(task, data); +} + +static void ff_layout_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + struct nfs_page *req; + __u64 count = 0; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); + + pnfs_generic_write_commit_done(task, data); } static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) @@ -1205,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; @@ -1256,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) hdr->args.fh = fh; - /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1385,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, @@ -1488,6 +1687,247 @@ out: dprintk("%s: Return\n", __func__); } +static int +ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + + return snprintf(buf, buflen, "%pI4", &sin->sin_addr); +} + +static size_t +ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf, + const int buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr *addr = &sin6->sin6_addr; + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded ANY address + */ + if (ipv6_addr_any(addr)) + return snprintf(buf, buflen, "::"); + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded loopback address + */ + if (ipv6_addr_loopback(addr)) + return snprintf(buf, buflen, "::1"); + + /* + * RFC 4291, Section 2.2.3 + * + * Special presentation address format for mapped v4 + * addresses. + */ + if (ipv6_addr_v4mapped(addr)) + return snprintf(buf, buflen, "::ffff:%pI4", + &addr->s6_addr32[3]); + + /* + * RFC 4291, Section 2.2.1 + */ + return snprintf(buf, buflen, "%pI6c", addr); +} + +/* Derived from rpc_sockaddr2uaddr */ +static void +ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) +{ + struct sockaddr *sap = (struct sockaddr *)&da->da_addr; + char portbuf[RPCBIND_MAXUADDRPLEN]; + char addrbuf[RPCBIND_MAXUADDRLEN]; + char *netid; + unsigned short port; + int len, netid_len; + __be32 *p; + + switch (sap->sa_family) { + case AF_INET: + if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in *)sap)->sin_port); + netid = "tcp"; + netid_len = 3; + break; + case AF_INET6: + if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + netid = "tcp6"; + netid_len = 4; + break; + default: + /* we only support tcp and tcp6 */ + WARN_ON_ONCE(1); + return; + } + + snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff); + len = strlcat(addrbuf, portbuf, sizeof(addrbuf)); + + p = xdr_reserve_space(xdr, 4 + netid_len); + xdr_encode_opaque(p, netid, netid_len); + + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, addrbuf, len); +} + +static void +ff_layout_encode_nfstime(struct xdr_stream *xdr, + ktime_t t) +{ + struct timespec64 ts; + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + ts = ktime_to_timespec64(t); + p = xdr_encode_hyper(p, ts.tv_sec); + *p++ = cpu_to_be32(ts.tv_nsec); +} + +static void +ff_layout_encode_io_latency(struct xdr_stream *xdr, + struct nfs4_ff_io_stat *stat) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 5 * 8); + p = xdr_encode_hyper(p, stat->ops_requested); + p = xdr_encode_hyper(p, stat->bytes_requested); + p = xdr_encode_hyper(p, stat->ops_completed); + p = xdr_encode_hyper(p, stat->bytes_completed); + p = xdr_encode_hyper(p, stat->bytes_not_delivered); + ff_layout_encode_nfstime(xdr, stat->total_busy_time); + ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time); +} + +static void +ff_layout_encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo) +{ + struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private; + struct nfs4_pnfs_ds_addr *da; + struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; + struct nfs_fh *fh = &mirror->fh_versions[0]; + __be32 *p, *start; + + da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); + dprintk("%s: DS %s: encoding address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + /* layoutupdate length */ + start = xdr_reserve_space(xdr, 4); + /* netaddr4 */ + ff_layout_encode_netaddr(xdr, da); + /* nfs_fh4 */ + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); + /* ff_io_latency4 read */ + spin_lock(&mirror->lock); + ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + /* ff_io_latency4 write */ + ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); + spin_unlock(&mirror->lock); + /* nfstime4 */ + ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + /* bool */ + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(false); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); +} + +static bool +ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, + struct pnfs_layout_segment *pls, + int *dev_count, int dev_limit) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *dev; + struct nfs42_layoutstat_devinfo *devinfo; + int i; + + for (i = 0; i <= FF_LAYOUT_MIRROR_COUNT(pls); i++) { + if (*dev_count >= dev_limit) + break; + mirror = FF_LAYOUT_COMP(pls, i); + if (!mirror || !mirror->mirror_ds) + continue; + dev = FF_LAYOUT_DEVID_NODE(pls, i); + devinfo = &args->devinfo[*dev_count]; + memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); + devinfo->offset = pls->pls_range.offset; + devinfo->length = pls->pls_range.length; + /* well, we don't really know if IO is continuous or not! */ + devinfo->read_count = mirror->read_stat.io_stat.bytes_completed; + devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; + devinfo->write_count = mirror->write_stat.io_stat.bytes_completed; + devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->layoutstats_encode = ff_layout_encode_layoutstats; + devinfo->layout_private = mirror; + /* lseg refcount put in cleanup_layoutstats */ + pnfs_get_lseg(pls); + + ++(*dev_count); + } + + return *dev_count < dev_limit; +} + +static int +ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +{ + struct pnfs_layout_segment *pls; + int dev_count = 0; + + spin_lock(&args->inode->i_lock); + list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { + dev_count += FF_LAYOUT_MIRROR_COUNT(pls); + } + spin_unlock(&args->inode->i_lock); + /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ + if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) { + dprintk("%s: truncating devinfo to limit (%d:%d)\n", + __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); + dev_count = PNFS_LAYOUTSTATS_MAXDEV; + } + args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL); + if (!args->devinfo) + return -ENOMEM; + + dev_count = 0; + spin_lock(&args->inode->i_lock); + list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { + if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count, + PNFS_LAYOUTSTATS_MAXDEV)) { + break; + } + } + spin_unlock(&args->inode->i_lock); + args->num_dev = dev_count; + + return 0; +} + +static void +ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) +{ + struct nfs4_ff_layout_mirror *mirror; + int i; + + for (i = 0; i < data->args.num_dev; i++) { + mirror = data->args.devinfo[i].layout_private; + data->args.devinfo[i].layout_private = NULL; + pnfs_put_lseg(mirror->lseg); + } +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -1510,6 +1950,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, .sync = pnfs_nfs_generic_sync, + .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cleanup_layoutstats = ff_layout_cleanup_layoutstats, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 070f20445b2d..f92f9a0a856b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -9,12 +9,17 @@ #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H #define FS_NFS_NFS4FLEXFILELAYOUT_H +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 + #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom * due to network error etc. */ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +/* LAYOUTSTATS report interval in ms */ +#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) + struct nfs4_ff_ds_version { u32 version; u32 minor_version; @@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err { struct nfs4_deviceid deviceid; }; +struct nfs4_ff_io_stat { + __u64 ops_requested; + __u64 bytes_requested; + __u64 ops_completed; + __u64 bytes_completed; + __u64 bytes_not_delivered; + ktime_t total_busy_time; + ktime_t aggregate_completion_time; +}; + +struct nfs4_ff_busy_timer { + ktime_t start_time; + atomic_t n_ops; +}; + +struct nfs4_ff_layoutstat { + struct nfs4_ff_io_stat io_stat; + struct nfs4_ff_busy_timer busy_timer; +}; + struct nfs4_ff_layout_mirror { + struct pnfs_layout_segment *lseg; /* back pointer */ u32 ds_count; u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - struct nfs4_string user_name; - struct nfs4_string group_name; u32 uid; u32 gid; struct rpc_cred *cred; spinlock_t lock; + struct nfs4_ff_layoutstat read_stat; + struct nfs4_ff_layoutstat write_stat; + ktime_t start_time; + ktime_t last_report_time; }; struct nfs4_ff_layout_segment { struct pnfs_layout_segment generic_hdr; u64 stripe_unit; + u32 flags; u32 mirror_array_cnt; struct nfs4_ff_layout_mirror **mirror_array; }; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 77a2d026aa12..f13e1969eedd 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -324,7 +324,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, __func__, PTR_ERR(cred)); return PTR_ERR(cred); } else { - mirror->cred = cred; + if (cmpxchg(&mirror->cred, NULL, cred)) + put_rpccred(cred); } } return 0; @@ -386,7 +387,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); if (ds->ds_clp) - goto out; + goto out_update_creds; flavor = nfs4_ff_layout_choose_authflavor(mirror); @@ -430,7 +431,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, } } } - +out_update_creds: if (ff_layout_update_mirror_cred(mirror, ds)) ds = NULL; out: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f734562c6d24..b77b328a06d7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -678,6 +678,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; } out: trace_nfs_getattr_exit(inode, err); @@ -2008,17 +2010,15 @@ static int __init init_nfs_fs(void) if (err) goto out1; -#ifdef CONFIG_PROC_FS rpc_proc_register(&init_net, &nfs_rpcstat); -#endif - if ((err = register_nfs_fs()) != 0) + + err = register_nfs_fs(); + if (err) goto out0; return 0; out0: -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); @@ -2049,9 +2049,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 53852a4bd88b..9b04c2e6fffc 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1342,7 +1342,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, if (args->npages != 0) xdr_write_pages(xdr, args->pages, 0, args->len); else - xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); + xdr_reserve_space(xdr, args->len); error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 7afb8947dfdf..ff66ae700b89 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -5,11 +5,18 @@ #ifndef __LINUX_FS_NFS_NFS4_2_H #define __LINUX_FS_NFS_NFS4_2_H +/* + * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support + * more? Need to consider not to pre-alloc too much for a compound. + */ +#define PNFS_LAYOUTSTATS_MAXDEV (4) + /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); - +int nfs42_proc_layoutstats_generic(struct nfs_server *, + struct nfs42_layoutstat_data *); /* nfs4.2xdr.h */ extern struct rpc_procinfo nfs4_2_procedures[]; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 3a9e75235f30..f486b80f927a 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -10,6 +10,11 @@ #include <linux/nfs_fs.h> #include "nfs4_fs.h" #include "nfs42.h" +#include "iostat.h" +#include "pnfs.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, fmode_t fmode) @@ -165,3 +170,85 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } + +static void +nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layoutstat_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + default: + dprintk("%s server returns %d\n", __func__, task->tk_status); + } +} + +static void +nfs42_layoutstat_release(void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutstats) + nfss->pnfs_curr_ld->cleanup_layoutstats(data); + + pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags); + smp_mb__after_atomic(); + nfs_iput_and_deactive(data->inode); + kfree(data->args.devinfo); + kfree(data); +} + +static const struct rpc_call_ops nfs42_layoutstat_ops = { + .rpc_call_prepare = nfs42_layoutstat_prepare, + .rpc_call_done = nfs42_layoutstat_done, + .rpc_release = nfs42_layoutstat_release, +}; + +int nfs42_proc_layoutstats_generic(struct nfs_server *server, + struct nfs42_layoutstat_data *data) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs42_layoutstat_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + + data->inode = nfs_igrab_and_active(data->args.inode); + if (!data->inode) { + nfs42_layoutstat_release(data); + return -EAGAIN; + } + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + return 0; +} diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 1a25b27248f2..a6bd27da6286 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -4,6 +4,8 @@ #ifndef __LINUX_FS_NFS_NFS4_2XDR_H #define __LINUX_FS_NFS_NFS4_2XDR_H +#include "nfs42.h" + #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) @@ -22,6 +24,16 @@ 1 /* whence */ + \ 2 /* offset */ + \ 2 /* length */) +#define encode_io_info_maxsz 4 +#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + encode_io_info_maxsz + \ + encode_io_info_maxsz + \ + 1 /* opaque devaddr4 length */ + \ + XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) +#define decode_layoutstats_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -45,6 +57,14 @@ #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) +#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz) +#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) static void encode_fallocate(struct xdr_stream *xdr, @@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr, encode_uint32(xdr, args->sa_what); } +static void encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, devinfo->offset); + p = xdr_encode_hyper(p, devinfo->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4); + p = xdr_encode_hyper(p, devinfo->read_count); + p = xdr_encode_hyper(p, devinfo->read_bytes); + p = xdr_encode_hyper(p, devinfo->write_count); + p = xdr_encode_hyper(p, devinfo->write_bytes); + p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data, + NFS4_DEVICEID4_SIZE); + /* Encode layoutupdate4 */ + *p++ = cpu_to_be32(devinfo->layout_type); + if (devinfo->layoutstats_encode != NULL) + devinfo->layoutstats_encode(xdr, args, devinfo); + else + encode_uint32(xdr, 0); +} + /* * Encode ALLOCATE request */ @@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTSTATS request + */ +static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args) +{ + int i; + + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < args->num_dev; i++) + encode_layoutstats(xdr, args, &args->devinfo[i], &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -169,6 +238,12 @@ out_overflow: return -EIO; } +static int decode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_res *res) +{ + return decode_op_hdr(xdr, OP_LAYOUTSTATS); +} + /* * Decode ALLOCATE request */ @@ -246,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode LAYOUTSTATS request + */ +static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_layoutstat_res *res) +{ + struct compound_hdr hdr; + int status, i; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < res->num_dev; i++) { + status = decode_layoutstats(xdr, res); + if (status) + goto out; + } +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index fdef424b0cd3..ea3bee919a76 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, struct rpc_message *, struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); +extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e42be52a8c18..3aa6a9ba5113 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -676,7 +676,6 @@ found: break; } - /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index f58c17b3b480..dcd39d4e2efe 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -41,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp) dprintk("NFS: open file(%pd2)\n", dentry); + err = nfs_check_flags(openflags); + if (err) + return err; + if ((openflags & O_ACCMODE) == 3) openflags--; diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index c0b3a16b4a00..039b3eb6d834 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p goto out; } - if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_rootfh:" - " getroot obtained referral\n"); - ret = -EREMOTE; - goto out; - } - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 2e1737c40a29..535dfc69c628 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp) int nfs_idmap_init(void) { - int ret; - ret = nfs_idmap_init_keyring(); - if (ret != 0) - goto out; -out: - return ret; + return nfs_idmap_init_keyring(); } void nfs_idmap_quit(void) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 55e1e3af23a3..6f228b5af819 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ case 0: return 0; case -NFS4ERR_OPENMODE: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: if (inode && nfs4_have_delegation(inode, FMODE_READ)) { nfs4_inode_return_delegation(inode); exception->retry = 1; @@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - if (state == NULL) - break; - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - goto wait_on_recovery; case -NFS4ERR_EXPIRED: if (state != NULL) { ret = nfs4_schedule_stateid_recovery(server, state); @@ -482,8 +476,8 @@ struct nfs4_call_sync_data { struct nfs4_sequence_res *seq_res; }; -static void nfs4_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) +void nfs4_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) { args->sa_slot = NULL; args->sa_cache_this = cache_reply; @@ -1553,6 +1547,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod struct nfs4_state *newstate; int ret; + if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR || + opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) && + (opendata->o_arg.u.delegation_type & fmode) != fmode) + /* This mode can't have been delegated, so we must have + * a valid open_stateid to cover it - not need to reclaim. + */ + return 0; opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; opendata->o_arg.share_access = nfs4_map_atomic_open_share( @@ -1684,6 +1685,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct "%d.\n", __func__, err); case 0: case -ENOENT: + case -EAGAIN: case -ESTALE: break; case -NFS4ERR_BADSESSION: @@ -3355,6 +3357,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, goto out; case -NFS4ERR_MOVED: err = nfs4_get_referral(client, dir, name, fattr, fhandle); + if (err == -NFS4ERR_MOVED) + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); goto out; case -NFS4ERR_WRONGSEC: err = -EPERM; @@ -4955,49 +4959,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } -static unsigned int -nfs4_init_nonuniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - unsigned int result; + int result; + size_t len; + char *str; + bool retried = false; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; +retry: + rcu_read_lock(); + len = 10 + strlen(clp->cl_ipaddr) + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + + 1; + rcu_read_unlock(); + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; rcu_read_lock(); - result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO)); + result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); rcu_read_unlock(); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + + /* Did something change? */ + if (result >= len) { + kfree(str); + if (retried) + return -EINVAL; + retried = true; + goto retry; + } + clp->cl_owner_id = str; + return 0; } -static unsigned int -nfs4_init_uniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_uniquifier_client_string(struct nfs_client *clp) +{ + int result; + size_t len; + char *str; + + len = 10 + 10 + 1 + 10 + 1 + + strlen(nfs4_client_id_uniquifier) + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, clp->cl_minorversion, + nfs4_client_id_uniquifier, + clp->cl_rpcclient->cl_nodename); + if (result >= len) { + kfree(str); + return -EINVAL; + } + clp->cl_owner_id = str; + return 0; +} + +static int +nfs4_init_uniform_client_string(struct nfs_client *clp) { - const char *nodename = clp->cl_rpcclient->cl_nodename; - unsigned int result; + int result; + size_t len; + char *str; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; if (nfs4_client_id_uniquifier[0] != '\0') - result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", - clp->rpc_ops->version, - clp->cl_minorversion, - nfs4_client_id_uniquifier, - nodename); - else - result = scnprintf(buf, len, "Linux NFSv%u.%u %s", - clp->rpc_ops->version, clp->cl_minorversion, - nodename); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + return nfs4_init_uniquifier_client_string(clp); + + len = 10 + 10 + 1 + 10 + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + clp->cl_rpcclient->cl_nodename); + if (result >= len) { + kfree(str); + return -EINVAL; + } + clp->cl_owner_id = str; + return 0; } /* @@ -5044,7 +5127,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, struct nfs4_setclientid setclientid = { .sc_verifier = &sc_verifier, .sc_prog = program, - .sc_cb_ident = clp->cl_cb_ident, + .sc_clnt = clp, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], @@ -5064,16 +5147,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* nfs_client_id4 */ nfs4_init_boot_verifier(clp, &sc_verifier); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) - setclientid.sc_name_len = - nfs4_init_uniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_uniform_client_string(clp); else - setclientid.sc_name_len = - nfs4_init_nonuniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_nonuniform_client_string(clp); + + if (status) + goto out; + /* cb_client4 */ setclientid.sc_netid_len = nfs4_init_callback_netid(clp, @@ -5083,9 +5165,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); - dprintk("NFS call setclientid auth=%s, '%.*s'\n", + dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - setclientid.sc_name_len, setclientid.sc_name); + clp->cl_owner_id); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) { status = PTR_ERR(task); @@ -5402,6 +5484,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); + get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); p->server = NFS_SERVER(inode); return p; @@ -5413,6 +5496,7 @@ static void nfs4_locku_release_calldata(void *data) nfs_free_seqid(calldata->arg.seqid); nfs4_put_lock_state(calldata->lsp); put_nfs_open_context(calldata->ctx); + fput(calldata->fl.fl_file); kfree(calldata); } @@ -6846,11 +6930,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; nfs4_init_boot_verifier(clp, &verifier); - args.id_len = nfs4_init_uniform_client_string(clp, args.id, - sizeof(args.id)); - dprintk("NFS call exchange_id auth=%s, '%.*s'\n", + + status = nfs4_init_uniform_client_string(clp); + if (status) + goto out; + + dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - args.id_len, args.id); + clp->cl_owner_id); res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); @@ -6885,7 +6972,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* unsupported! */ WARN_ON_ONCE(1); status = -EINVAL; - goto out_server_scope; + goto out_impl_id; } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); @@ -6913,6 +7000,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* use the most recent implementation id */ kfree(clp->cl_implid); clp->cl_implid = res.impl_id; + res.impl_id = NULL; if (clp->cl_serverscope != NULL && !nfs41_same_server_scope(clp->cl_serverscope, @@ -6926,15 +7014,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (clp->cl_serverscope == NULL) { clp->cl_serverscope = res.server_scope; - goto out; + res.server_scope = NULL; } - } else - kfree(res.impl_id); + } -out_server_owner: - kfree(res.server_owner); +out_impl_id: + kfree(res.impl_id); out_server_scope: kfree(res.server_scope); +out_server_owner: + kfree(res.server_owner); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -8061,9 +8150,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) struct rpc_task *task; int status = 0; - dprintk("NFS: %4d initiating layoutcommit call. sync %d " - "lbw: %llu inode %lu\n", - data->task.tk_pid, sync, + dprintk("NFS: initiating layoutcommit call. sync %d " + "lbw: %llu inode %lu\n", sync, data->args.lastbytewritten, data->args.inode->i_ino); @@ -8557,7 +8645,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE | NFS_CAP_DEALLOCATE - | NFS_CAP_SEEK, + | NFS_CAP_SEEK + | NFS_CAP_LAYOUTSTATS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2782cfca2265..605840dc89cf 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) goto do_confirm; - nfs4_begin_drain_session(clp); status = nfs4_proc_exchange_id(clp, cred); if (status != 0) goto out; @@ -1482,6 +1481,8 @@ restart: spin_unlock(&state->state_lock); } nfs4_put_open_state(state); + clear_bit(NFS4CLNT_RECLAIM_NOGRACE, + &state->flags); spin_lock(&sp->so_lock); goto restart; } @@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp) clp->cl_mvops->reboot_recovery_ops; int status; + nfs4_begin_drain_session(clp); cred = nfs4_get_clid_cred(clp); if (cred == NULL) return -ENOENT; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0aea97841d30..558cd65dbdb7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int); #define encode_setclientid_maxsz \ (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ - XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ + /* client name */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* sc_prog */ + \ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ @@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int); #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ 1 /* co_ownerid.len */ + \ - XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + /* eia_clientowner */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* flags */ + \ 1 /* spa_how */ + \ /* max is SP4_MACH_CRED (for now) */ + \ @@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); encode_nfs4_verifier(xdr, setclientid->sc_verifier); - encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id), + setclientid->sc_clnt->cl_owner_id); p = reserve_space(xdr, 4); *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); p = reserve_space(xdr, 4); - *p = cpu_to_be32(setclientid->sc_cb_ident); + *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident); } static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) @@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); encode_nfs4_verifier(xdr, args->verifier); - encode_string(xdr, args->id_len, args->id); + encode_string(xdr, strlen(args->client->cl_owner_id), + args->client->cl_owner_id); encode_uint32(xdr, args->flags); encode_uint32(xdr, args->state_protect.how); @@ -7427,6 +7431,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SEEK, enc_seek, dec_seek), PROC(ALLOCATE, enc_allocate, dec_allocate), PROC(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 282b39369510..1da68d3b1eda 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); - dprintk("NFS: %5u initiated pgio call " + dprintk("NFS: initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", - hdr->task.tk_pid, hdr->inode->i_sb->s_id, (unsigned long long)NFS_FILEID(hdr->inode), hdr->args.count, @@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - if (hdr->rw_ops->rw_release) - hdr->rw_ops->rw_release(hdr); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror, * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor * @inode: pointer to inode - * @doio: pointer to io function + * @pg_ops: pointer to pageio operations + * @compl_ops: pointer to pageio completion operations + * @rw_ops: pointer to nfs read/write operations * @bsize: io block size * @io_flags: extra parameters for the io function */ @@ -1186,6 +1185,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an * nfs_pageio_descriptor * @desc: pointer to io descriptor + * @mirror_idx: pointer to mirror index */ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, u32 mirror_idx) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 230606243be6..0ba9a02c9566 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -35,6 +35,7 @@ #include "iostat.h" #include "nfs4trace.h" #include "delegation.h" +#include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -1821,6 +1822,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, hdr->completion_ops); + set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); @@ -1865,6 +1867,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } static enum pnfs_try_status @@ -1979,6 +1982,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } /* @@ -2247,3 +2251,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) } return thp; } + +#if IS_ENABLED(CONFIG_NFS_V4_2) +int +pnfs_report_layoutstat(struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs42_layoutstat_data *data; + struct pnfs_layout_hdr *hdr; + int status = 0; + + if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats) + goto out; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS)) + goto out; + + if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags)) + goto out; + + spin_lock(&inode->i_lock); + if (!NFS_I(inode)->layout) { + spin_unlock(&inode->i_lock); + goto out; + } + hdr = NFS_I(inode)->layout; + pnfs_get_layout_hdr(hdr); + spin_unlock(&inode->i_lock); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + status = -ENOMEM; + goto out_put; + } + + data->args.fh = NFS_FH(inode); + data->args.inode = inode; + nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); + status = ld->prepare_layoutstats(&data->args); + if (status) + goto out_free; + + status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data); + +out: + dprintk("%s returns %d\n", __func__, status); + return status; + +out_free: + kfree(data); +out_put: + pnfs_put_layout_hdr(hdr); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); + smp_mb__after_atomic(); + goto out; +} +EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); +#endif diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1e6308f82fc3..3e6ab7bfbabd 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -178,6 +178,8 @@ struct pnfs_layoutdriver_type { void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); + int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data); }; struct pnfs_layout_hdr { @@ -290,7 +292,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); - /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ @@ -689,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_V4_2) +int pnfs_report_layoutstat(struct inode *inode); +#else +static inline int +pnfs_report_layoutstat(struct inode *inode) +{ + return 0; +} +#endif + #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f175b833b6ba..aa62004f1706 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2847,7 +2847,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) *((unsigned int *)kp->arg) = num; return 0; } -static struct kernel_param_ops param_ops_portnr = { +static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e6c262555e08..65869ca9c851 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1290,6 +1290,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); @@ -1348,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -static void nfs_writeback_release_common(struct nfs_pgio_header *hdr) -{ - /* do nothing! */ -} - /* * Special version of should_remove_suid() that ignores capabilities. */ @@ -1556,7 +1552,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, /* Set up the initial task struct. */ nfs_ops->commit_setup(data, &msg); - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + dprintk("NFS: initiated commit call\n"); nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); @@ -2013,7 +2009,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = { .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, - .rw_release = nfs_writeback_release_common, .rw_done = nfs_writeback_done, .rw_result = nfs_writeback_result, .rw_initiate = nfs_initiate_write, diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0ee0bed3649b..6b8b92b19cec 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 258d9fe2521a..4a73d6dffabf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -307,31 +307,13 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, static ssize_t nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - ssize_t size; + struct inode *inode = file_inode(iocb->ki_filp); if (iov_iter_rw(iter) == WRITE) return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); - - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. - */ - if (unlikely(iov_iter_rw(iter) == WRITE && size < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - nilfs_write_failed(mapping, end); - } - - return size; + return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); } const struct address_space_operations nilfs_aops = { diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 450648697433..5b1e2a497e51 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -26,7 +26,7 @@ #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/idr.h> -#include <linux/init.h> /* module_init */ +#include <linux/init.h> /* fs_initcall */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ @@ -812,4 +812,4 @@ static int __init inotify_user_setup(void) return 0; } -module_init(inotify_user_setup); +fs_initcall(inotify_user_setup); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 2cd653670764..262561fea923 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -382,7 +382,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, base_ni = ni; if (NInoAttr(ni)) base_ni = ni->ext.base_ntfs_ino; - err = file_remove_suid(file); + err = file_remove_privs(file); if (unlikely(err)) goto out; /* diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 76b6cfb579d7..b3c3469de6cb 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -239,7 +239,7 @@ typedef struct { */ static inline ntfs_inode *NTFS_I(struct inode *inode) { - return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); + return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); } static inline struct inode *VFS_I(ntfs_inode *ni) diff --git a/fs/open.c b/fs/open.c index e0250bdcc440..e33dab287fa0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -51,8 +51,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ATTR_FILE; } - /* Remove suid/sgid on truncate too */ - ret = should_remove_suid(dentry); + /* Remove suid, sgid, and file capabilities on truncate too */ + ret = dentry_needs_remove_privs(dentry); + if (ret < 0) + return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; @@ -678,18 +680,18 @@ int open_check_o_direct(struct file *f) } static int do_dentry_open(struct file *f, + struct inode *inode, int (*open)(struct inode *, struct file *), const struct cred *cred) { static const struct file_operations empty_fops = {}; - struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; path_get(&f->f_path); - inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_inode = inode; f->f_mapping = inode->i_mapping; if (unlikely(f->f_flags & O_PATH)) { @@ -793,7 +795,8 @@ int finish_open(struct file *file, struct dentry *dentry, BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; - error = do_dentry_open(file, open, current_cred()); + error = do_dentry_open(file, d_backing_inode(dentry), open, + current_cred()); if (!error) *opened |= FILE_OPENED; @@ -822,6 +825,34 @@ int finish_no_open(struct file *file, struct dentry *dentry) } EXPORT_SYMBOL(finish_no_open); +char *file_path(struct file *filp, char *buf, int buflen) +{ + return d_path(&filp->f_path, buf, buflen); +} +EXPORT_SYMBOL(file_path); + +/** + * vfs_open - open the file at the given path + * @path: path to open + * @file: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *file, + const struct cred *cred) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + + file->f_path = *path; + if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { + inode = dentry->d_op->d_select_inode(dentry, file->f_flags); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } + + return do_dentry_open(file, inode, NULL, cred); +} + struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { @@ -853,26 +884,6 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -/** - * vfs_open - open the file at the given path - * @path: path to open - * @filp: newly allocated file with f_flag initialized - * @cred: credentials to use - */ -int vfs_open(const struct path *path, struct file *filp, - const struct cred *cred) -{ - struct inode *inode = path->dentry->d_inode; - - if (inode->i_op->dentry_open) - return inode->i_op->dentry_open(path->dentry, filp, cred); - else { - filp->f_path = *path; - return do_dentry_open(filp, NULL, cred); - } -} -EXPORT_SYMBOL(vfs_open); - static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 308379b2d0b2..f140e3dbfb7b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -337,37 +337,30 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, return true; } -static int ovl_dentry_open(struct dentry *dentry, struct file *file, - const struct cred *cred) +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) { int err; struct path realpath; enum ovl_path_type type; - bool want_write = false; type = ovl_path_real(dentry, &realpath); - if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { - want_write = true; + if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); if (err) - goto out; + return ERR_PTR(err); - if (file->f_flags & O_TRUNC) + if (file_flags & O_TRUNC) err = ovl_copy_up_last(dentry, NULL, true); else err = ovl_copy_up(dentry); + ovl_drop_write(dentry); if (err) - goto out_drop_write; + return ERR_PTR(err); ovl_path_upper(dentry, &realpath); } - err = vfs_open(&realpath, file, cred); -out_drop_write: - if (want_write) - ovl_drop_write(dentry); -out: - return err; + return d_backing_inode(realpath.dentry); } static const struct inode_operations ovl_file_inode_operations = { @@ -378,7 +371,6 @@ static const struct inode_operations ovl_file_inode_operations = { .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, - .dentry_open = ovl_dentry_open, }; static const struct inode_operations ovl_symlink_inode_operations = { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 17ac5afc9ffb..ea5a40b06e3a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 907870e81a72..70e9af551600 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -23,6 +23,7 @@ struct ovl_cache_entry { u64 ino; struct list_head l_node; struct rb_node node; + struct ovl_cache_entry *next_maybe_whiteout; bool is_whiteout; char name[]; }; @@ -39,7 +40,7 @@ struct ovl_readdir_data { struct rb_root root; struct list_head *list; struct list_head middle; - struct dentry *dir; + struct ovl_cache_entry *first_maybe_whiteout; int count; int err; }; @@ -79,7 +80,7 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } -static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, +static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { @@ -98,29 +99,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, p->is_whiteout = false; if (d_type == DT_CHR) { - struct dentry *dentry; - const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) { - kfree(p); - return NULL; - } - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); - - dentry = lookup_one_len(name, dir, len); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } - revert_creds(old_cred); - put_cred(override_cred); + p->next_maybe_whiteout = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p; } return p; } @@ -148,7 +128,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, return 0; } - p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type); + p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) return -ENOMEM; @@ -169,7 +149,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { - p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else @@ -219,6 +199,43 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); } +static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) +{ + int err; + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + err = mutex_lock_killable(&dir->d_inode->i_mutex); + if (!err) { + while (rdd->first_maybe_whiteout) { + p = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_len(p->name, dir, p->len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + } + mutex_unlock(&dir->d_inode->i_mutex); + } + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + static inline int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd) { @@ -229,7 +246,7 @@ static inline int ovl_dir_read(struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->dir = realpath->dentry; + rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { rdd->count = 0; @@ -238,6 +255,10 @@ static inline int ovl_dir_read(struct path *realpath, if (err >= 0) err = rdd->err; } while (!err && rdd->count); + + if (!err && rdd->first_maybe_whiteout) + err = ovl_check_whiteouts(realpath->dentry, rdd); + fput(realfile); return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bf8537c7f455..7466ff339c66 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,8 +273,56 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (ret < 0) + return ret; + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + return -ESTALE; + } + } + } + return 1; +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) { + ret = d->d_op->d_weak_revalidate(d, flags); + if (ret <= 0) + break; + } + } + return ret; +} + static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, +}; + +static const struct dentry_operations ovl_reval_dentry_operations = { + .d_release = ovl_dentry_release, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, }; static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) @@ -288,6 +336,20 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) return oe; } +static bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); +} + +static bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + static inline struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) { @@ -303,6 +365,10 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, } else if (!dentry->d_inode) { dput(dentry); dentry = NULL; + } else if (ovl_dentry_weird(dentry)) { + dput(dentry); + /* Don't support traversing automounts and other weirdness */ + dentry = ERR_PTR(-EREMOTE); } return dentry; } @@ -350,6 +416,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out; if (this) { + if (unlikely(ovl_dentry_remote(this))) { + dput(this); + err = -EREMOTE; + goto out; + } if (ovl_is_whiteout(this)) { dput(this); this = NULL; @@ -694,25 +765,6 @@ static void ovl_unescape(char *s) } } -static bool ovl_is_allowed_fs_type(struct dentry *root) -{ - const struct dentry_operations *dop = root->d_op; - - /* - * We don't support: - * - automount filesystems - * - filesystems with revalidate (FIXME for lower layer) - * - filesystems with case insensitive names - */ - if (dop && - (dop->d_manage || dop->d_automount || - dop->d_revalidate || dop->d_weak_revalidate || - dop->d_compare || dop->d_hash)) { - return false; - } - return true; -} - static int ovl_mount_dir_noesc(const char *name, struct path *path) { int err = -EINVAL; @@ -727,7 +779,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) goto out; } err = -EINVAL; - if (!ovl_is_allowed_fs_type(path->dentry)) { + if (ovl_dentry_weird(path->dentry)) { pr_err("overlayfs: filesystem on '%s' not supported\n", name); goto out_put; } @@ -751,13 +803,21 @@ static int ovl_mount_dir(const char *name, struct path *path) if (tmp) { ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); + + if (!err) + if (ovl_dentry_remote(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + tmp); + path_put(path); + err = -EINVAL; + } kfree(tmp); } return err; } static int ovl_lower_dir(const char *name, struct path *path, long *namelen, - int *stack_depth) + int *stack_depth, bool *remote) { int err; struct kstatfs statfs; @@ -774,6 +834,9 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen, *namelen = max(*namelen, statfs.f_namelen); *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + if (ovl_dentry_remote(path->dentry)) + *remote = true; + return 0; out_put: @@ -827,6 +890,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int numlower; unsigned int stacklen = 0; unsigned int i; + bool remote = false; int err; err = -ENOMEM; @@ -900,7 +964,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { err = ovl_lower_dir(lower, &stack[numlower], - &ufs->lower_namelen, &sb->s_stack_depth); + &ufs->lower_namelen, &sb->s_stack_depth, + &remote); if (err) goto out_put_lowerpath; @@ -958,7 +1023,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; - sb->s_d_op = &ovl_dentry_operations; + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; oe = ovl_alloc_entry(numlower); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84bb65b83570..4fb17ded7d47 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = posix_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = posix_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = posix_acl_create_masq(*acl, mode); + ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; diff --git a/fs/proc/base.c b/fs/proc/base.c index 1d540b3f226f..87782e874b6a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -491,14 +491,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, } #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - seq_printf(m, "%llu %llu %lu\n", + if (unlikely(!sched_info_on())) + seq_printf(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@ -2787,7 +2790,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP @@ -3135,7 +3138,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP diff --git a/fs/proc/generic.c b/fs/proc/generic.c index df6327a2b865..e5dee5c3188e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -373,6 +373,10 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; } + if (is_empty_pde(*parent)) { + WARN(1, "attempt to add to permanently empty directory"); + return NULL; + } ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); if (!ent) @@ -455,6 +459,25 @@ struct proc_dir_entry *proc_mkdir(const char *name, } EXPORT_SYMBOL(proc_mkdir); +struct proc_dir_entry *proc_create_mount_point(const char *name) +{ + umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO; + struct proc_dir_entry *ent, *parent = NULL; + + ent = __proc_create(&parent, name, mode, 2); + if (ent) { + ent->data = NULL; + ent->proc_fops = NULL; + ent->proc_iops = NULL; + if (proc_register(parent, ent) < 0) { + kfree(ent); + parent->nlink--; + ent = NULL; + } + } + return ent; +} + struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index afe232b9df6e..bd95b9fdebb0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -422,6 +422,10 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; + if (is_empty_pde(de)) { + make_empty_dir_inode(inode); + return inode; + } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c835b94c0cd3..aa2781095bd1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -191,6 +191,12 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) } extern void pde_put(struct proc_dir_entry *); +static inline bool is_empty_pde(const struct proc_dir_entry *pde) +{ + return S_ISDIR(pde->mode) && !pde->proc_iops; +} +struct proc_dir_entry *proc_create_mount_point(const char *name); + /* * inode.c */ diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index d4a35746cab9..f8595e8b5cd0 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -64,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } seq_putc(m, '\n'); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fea2561d773b..fdda62e6115e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -19,6 +19,28 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +/* Support for permanently empty directories */ + +struct ctl_table sysctl_mount_point[] = { + { } +}; + +static bool is_empty_dir(struct ctl_table_header *head) +{ + return head->ctl_table[0].child == sysctl_mount_point; +} + +static void set_empty_dir(struct ctl_dir *dir) +{ + dir->header.ctl_table[0].child = sysctl_mount_point; +} + +static void clear_empty_dir(struct ctl_dir *dir) + +{ + dir->header.ctl_table[0].child = NULL; +} + void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) @@ -187,6 +209,17 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) struct ctl_table *entry; int err; + /* Is this a permanently empty directory? */ + if (is_empty_dir(&dir->header)) + return -EROFS; + + /* Am I creating a permanently empty directory? */ + if (header->ctl_table == sysctl_mount_point) { + if (!RB_EMPTY_ROOT(&dir->root)) + return -EINVAL; + set_empty_dir(dir); + } + dir->header.nreg++; header->parent = dir; err = insert_links(header); @@ -202,6 +235,8 @@ fail: erase_header(header); put_links(header); fail_links: + if (header->ctl_table == sysctl_mount_point) + clear_empty_dir(dir); header->parent = NULL; drop_sysctl_table(&dir->header); return err; @@ -419,6 +454,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; + if (is_empty_dir(head)) + make_empty_dir_inode(inode); } out: return inode; diff --git a/fs/proc/root.c b/fs/proc/root.c index b7fa4bfe896a..68feb0f70e63 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -112,9 +112,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) - return ERR_PTR(-EPERM); - /* Does the mounter have privilege over the pid namespace? */ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -159,7 +156,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -182,10 +179,10 @@ void __init proc_root_init(void) #endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); - proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ + proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ - proc_mkdir("openprom", NULL); + proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6dee68d013ff..ca1e091881d4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -310,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) */ if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, "\n"); + seq_file_path(m, file, "\n"); goto done; } @@ -1509,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (file) { seq_puts(m, " file="); - seq_path(m, &file->f_path, "\n\t= "); + seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); } else { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 599ec2e20104..e0d64c92e4f6 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -180,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } else if (mm) { pid_t tid = pid_of_stack(priv, vma, is_pid); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8db932da4009..8ebd9a334085 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -17,7 +17,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; int event; @@ -25,8 +26,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); event = ACCESS_ONCE(ns->event); - if (p->m.poll_event != event) { - p->m.poll_event = event; + if (m->poll_event != event) { + m->poll_event = event; res |= POLLERR | POLLPRI; } @@ -92,7 +93,7 @@ static void show_type(struct seq_file *m, struct super_block *sb) static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -126,7 +127,7 @@ out: static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -186,7 +187,7 @@ out: static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; @@ -236,6 +237,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, struct mnt_namespace *ns = NULL; struct path root; struct proc_mounts *p; + struct seq_file *m; int ret = -EINVAL; if (!task) @@ -260,26 +262,21 @@ static int mounts_open_common(struct inode *inode, struct file *file, task_unlock(task); put_task_struct(task); - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) + ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); + if (ret) goto err_put_path; - file->private_data = &p->m; - ret = seq_open(file, &mounts_op); - if (ret) - goto err_free; + m = file->private_data; + m->poll_event = ns->event; + p = m->private; p->ns = ns; p->root = root; - p->m.poll_event = ns->event; p->show = show; p->cached_event = ~0ULL; return 0; - err_free: - kfree(p); err_put_path: path_put(&root); err_put_ns: @@ -290,10 +287,11 @@ static int mounts_open_common(struct inode *inode, struct file *file, static int mounts_release(struct inode *inode, struct file *file) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; path_put(&p->root); put_mnt_ns(p->ns); - return seq_release(inode, file); + return seq_release_private(inode, file); } static int mounts_open(struct inode *inode, struct file *file) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index dc43b5f29305..3adcc4669fac 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -461,22 +461,18 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; -static struct kobject *pstore_kobj; - static int __init init_pstore_fs(void) { - int err = 0; + int err; /* Create a convenient mount point for people to access pstore */ - pstore_kobj = kobject_create_and_add("pstore", fs_kobj); - if (!pstore_kobj) { - err = -ENOMEM; + err = sysfs_create_mount_point(fs_kobj, "pstore"); + if (err) goto out; - } err = register_filesystem(&pstore_fs_type); if (err < 0) - kobject_put(pstore_kobj); + sysfs_remove_mount_point(fs_kobj, "pstore"); out: return err; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8d64bb5366bf..e1f37278cf97 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n) return page; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static unsigned last_entry(struct inode *inode, unsigned long page_nr) { unsigned long last_byte = inode->i_size; diff --git a/fs/seq_file.c b/fs/seq_file.c index 555f82155be8..ce9e39fd5daf 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,18 +48,21 @@ static void *seq_buf_alloc(unsigned long size) * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". + * Note: seq_open() will allocate a struct seq_file and store its + * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { - struct seq_file *p = file->private_data; + struct seq_file *p; + + WARN_ON(file->private_data); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + file->private_data = p; - if (!p) { - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return -ENOMEM; - file->private_data = p; - } - memset(p, 0, sizeof(*p)); mutex_init(&p->lock); p->op = op; #ifdef CONFIG_USER_NS @@ -487,6 +490,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc) } EXPORT_SYMBOL(seq_path); +/** + * seq_file_path - seq_file interface to print a pathname of a file + * @m: the seq_file handle + * @file: the struct file to print + * @esc: set of characters to escape in the output + * + * return the absolute path to the file. + */ +int seq_file_path(struct seq_file *m, struct file *file, const char *esc) +{ + return seq_path(m, &file->f_path, esc); +} +EXPORT_SYMBOL(seq_file_path); + /* * Same as seq_path, but relative to supplied root. */ @@ -538,6 +555,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } +EXPORT_SYMBOL(seq_dentry); static void *single_start(struct seq_file *p, loff_t *pos) { diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 73588e7700ed..d09fcd6fb85d 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -49,6 +49,6 @@ struct squashfs_inode_info { static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) { - return list_entry(inode, struct squashfs_inode_info, vfs_inode); + return container_of(inode, struct squashfs_inode_info, vfs_inode); } #endif diff --git a/fs/super.c b/fs/super.c index 928c20f47af9..b61372354f2b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -842,7 +842,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if (dev == (1 << MINORBITS)) { + if (dev >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0b45ff42f374..94374e435025 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -121,3 +121,37 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } + +/** + * sysfs_create_mount_point - create an always empty directory + * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to add + */ +int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *kn, *parent = parent_kobj->sd; + + kn = kernfs_create_empty_dir(parent, name); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_create_mount_point); + +/** + * sysfs_remove_mount_point - remove an always empty directory. + * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to remove + * + */ +void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *parent = parent_kobj->sd; + + kernfs_remove_by_name_ns(parent, name, NULL); +} +EXPORT_SYMBOL_GPL(sysfs_remove_mount_point); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8a49486bf30c..1c6ac6fcee9f 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -31,9 +31,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, bool new_sb; if (!(flags & MS_KERNMOUNT)) { - if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) - return ERR_PTR(-EPERM); - if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return ERR_PTR(-EPERM); } @@ -58,7 +55,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 8f3555f00c54..63c1bcb224ee 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 2c13525131cd..6c212288adcb 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -73,7 +73,7 @@ struct sysv_inode_info { static inline struct sysv_inode_info *SYSV_I(struct inode *inode) { - return list_entry(inode, struct sysv_inode_info, vfs_inode); + return container_of(inode, struct sysv_inode_info, vfs_inode); } static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index d92bdf3b079a..cbc8d5d2755a 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -496,16 +496,11 @@ struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *pare return dentry; } -static inline int tracefs_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (tracefs_positive(dentry)) { + if (simple_positive(dentry)) { if (dentry->d_inode) { dget(dentry); switch (dentry->d_inode->i_mode & S_IFMT) { @@ -582,7 +577,7 @@ void tracefs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!tracefs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -603,7 +598,7 @@ void tracefs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * tracefs_positive() check. + * simple_positive() check. */ goto loop; } @@ -631,14 +626,12 @@ bool tracefs_initialized(void) return tracefs_registered; } -static struct kobject *trace_kobj; - static int __init tracefs_init(void) { int retval; - trace_kobj = kobject_create_and_add("tracing", kernel_kobj); - if (!trace_kobj) + retval = sysfs_create_mount_point(kernel_kobj, "tracing"); + if (retval) return -EINVAL; retval = register_filesystem(&trace_fs_type); diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index b5cd8ed2aa12..b1b9a63d8cf3 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -56,7 +56,7 @@ struct udf_inode_info { static inline struct udf_inode_info *UDF_I(struct inode *inode) { - return list_entry(inode, struct udf_inode_info, vfs_inode); + return container_of(inode, struct udf_inode_info, vfs_inode); } #endif /* _UDF_I_H) */ diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 2c1036080d52..a7106eda5024 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -51,8 +51,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); - - lock_ufs(sb); + + mutex_lock(&UFS_SB(sb)->s_lock); cgno = ufs_dtog(uspi, fragment); bit = ufs_dtogd(uspi, fragment); @@ -115,13 +115,13 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - - unlock_ufs(sb); + + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return; } @@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) goto failed; } - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); do_more: overflow = 0; @@ -211,12 +211,12 @@ do_more: } ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed_unlock: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); failed: UFSD("EXIT (FAILED)\n"); return; @@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, usb1 = ubh_get_usb_first(uspi); *err = -ENOSPC; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); tmp = ufs_data_ptr_to_cpu(sb, p); if (count + ufs_fragnum(fragment) > uspi->s_fpb) { @@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, "fragment %llu, tmp %llu\n", (unsigned long long)fragment, (unsigned long long)tmp); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } else { if (tmp) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } @@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, * There is not enough space for user on the device */ if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } @@ -424,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -439,7 +439,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -477,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -485,7 +485,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return result; } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 1bfe8cabff0f..74f2e80288bf 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -65,11 +65,6 @@ static inline void ufs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long ufs_dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; @@ -87,7 +82,8 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode) + struct page *page, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_address(page); @@ -103,7 +99,8 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, err = ufs_commit_chunk(page, pos, len); ufs_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(dir); } @@ -256,7 +253,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, int namelen = qstr->len; unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); struct page *page = NULL; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -320,7 +317,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned short rec_len, name_len; struct page *page = NULL; struct ufs_dir_entry *de; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; loff_t pos; @@ -437,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned long npages = ufs_dir_pages(inode); + unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = file->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; @@ -608,7 +605,7 @@ int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; struct page *page = NULL; - unsigned long i, npages = ufs_dir_pages(inode); + unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { char *kaddr; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 7caa01652888..fd0203ce1f7f 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode) ino = inode->i_ino; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } @@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode) bit = ufs_inotocgoff (ino); ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); @@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); } @@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) sbi = UFS_SB(sb); uspi = sbi->s_uspi; - lock_ufs(sb); + mutex_lock(&sbi->s_lock); /* * Try to place the inode in its parent directory @@ -331,21 +331,21 @@ cg_found: sync_dirty_buffer(bh); brelse(bh); } - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); clear_nlink(inode); unlock_new_inode(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); make_bad_inode(inode); iput (inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 99aaf5c9bf4d..f913a6924b23 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -903,6 +903,9 @@ void ufs_evict_inode(struct inode * inode) invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) + if (want_delete) { + lock_ufs(inode->i_sb); ufs_free_inode(inode); + unlock_ufs(inode->i_sb); + } } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index f773deb1d2e3..47966554317c 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,11 +56,9 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi if (dentry->d_name.len > UFS_MAXNAMLEN) return ERR_PTR(-ENAMETOOLONG); - lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); if (ino) inode = ufs_iget(dir->i_sb, ino); - unlock_ufs(dir->i_sb); return d_splice_alias(inode, dentry); } @@ -76,24 +74,16 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; - int err; - - UFSD("BEGIN\n"); inode = ufs_new_inode(dir, mode); - err = PTR_ERR(inode); + if (IS_ERR(inode)) + return PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ufs_file_inode_operations; - inode->i_fop = &ufs_file_operations; - inode->i_mapping->a_ops = &ufs_aops; - mark_inode_dirty(inode); - lock_ufs(dir->i_sb); - err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); - } - UFSD("END: err=%d\n", err); - return err; + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + return ufs_add_nondir(dentry, inode); } static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) @@ -110,9 +100,7 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev init_special_inode(inode, mode, rdev); ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); mark_inode_dirty(inode); - lock_ufs(dir->i_sb); err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); } return err; } @@ -121,19 +109,18 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; - int err = -ENAMETOOLONG; + int err; unsigned l = strlen(symname)+1; struct inode * inode; if (l > sb->s_blocksize) - goto out_notlocked; + return -ENAMETOOLONG; inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_notlocked; + return err; - lock_ufs(dir->i_sb); if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ inode->i_op = &ufs_symlink_inode_operations; @@ -150,17 +137,13 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, } mark_inode_dirty(inode); - err = ufs_add_nondir(dentry, inode); -out: - unlock_ufs(dir->i_sb); -out_notlocked: - return err; + return ufs_add_nondir(dentry, inode); out_fail: inode_dec_link_count(inode); unlock_new_inode(inode); iput(inode); - goto out; + return err; } static int ufs_link (struct dentry * old_dentry, struct inode * dir, @@ -169,14 +152,16 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - lock_ufs(dir->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); ihold(inode); - error = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); + error = ufs_add_link(dentry, inode); + if (error) { + inode_dec_link_count(inode); + iput(inode); + } else + d_instantiate(dentry, inode); return error; } @@ -185,9 +170,12 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; + inode_inc_link_count(dir); + inode = ufs_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); if (IS_ERR(inode)) - return PTR_ERR(inode); + goto out_dir; inode->i_op = &ufs_dir_inode_operations; inode->i_fop = &ufs_dir_operations; @@ -195,9 +183,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) inode_inc_link_count(inode); - lock_ufs(dir->i_sb); - inode_inc_link_count(dir); - err = ufs_make_empty(inode, dir); if (err) goto out_fail; @@ -205,20 +190,19 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) err = ufs_add_link(dentry, inode); if (err) goto out_fail; - unlock_ufs(dir->i_sb); + unlock_new_inode(inode); d_instantiate(dentry, inode); -out: - return err; + return 0; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); unlock_new_inode(inode); iput (inode); +out_dir: inode_dec_link_count(dir); - unlock_ufs(dir->i_sb); - goto out; + return err; } static int ufs_unlink(struct inode *dir, struct dentry *dentry) @@ -248,7 +232,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); int err= -ENOTEMPTY; - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); if (!err) { @@ -257,7 +240,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) inode_dec_link_count(dir); } } - unlock_ufs(dir->i_sb); return err; } @@ -295,7 +277,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode); + ufs_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) drop_nlink(new_inode); @@ -318,7 +300,12 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_inode_dirty(old_inode); if (dir_de) { - ufs_set_link(old_inode, dir_de, dir_page, new_dir); + if (old_dir != new_dir) + ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 098508a93c7b..250579a80d90 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -695,6 +695,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) unsigned flags; lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -712,6 +713,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) ufs_put_cstotal(sb); UFSD("EXIT\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; @@ -800,6 +802,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); mutex_init(&sbi->mutex); + mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* @@ -1278,6 +1281,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) sync_filesystem(sb); lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1291,6 +1295,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } @@ -1298,12 +1303,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } @@ -1327,6 +1334,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) */ #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; #else @@ -1336,11 +1344,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EPERM; } @@ -1348,6 +1358,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 2a07396d5f9e..2e31ea2e35a3 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -30,6 +30,7 @@ struct ufs_sb_info { int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct mutex s_lock; }; struct ufs_inode_info { @@ -105,7 +106,7 @@ extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page extern int ufs_empty_dir (struct inode *); extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode); + struct page *page, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 516162be1398..f9e9ffe6fb46 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -149,13 +149,27 @@ xfs_alloc_compute_aligned( { xfs_agblock_t bno; xfs_extlen_t len; + xfs_extlen_t diff; /* Trim busy sections out of found extent */ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + /* + * If we have a largish extent that happens to start before min_agbno, + * see if we can shift it into range... + */ + if (bno < args->min_agbno && bno + len > args->min_agbno) { + diff = args->min_agbno - bno; + if (len > diff) { + bno += diff; + len -= diff; + } + } + if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); - xfs_extlen_t diff = aligned_bno - bno; + + diff = aligned_bno - bno; *resbno = aligned_bno; *reslen = diff >= len ? 0 : len - diff; @@ -795,9 +809,13 @@ xfs_alloc_find_best_extent( * The good extent is closer than this one. */ if (!dir) { + if (*sbnoa > args->max_agbno) + goto out_use_good; if (*sbnoa >= args->agbno + gdiff) goto out_use_good; } else { + if (*sbnoa < args->min_agbno) + goto out_use_good; if (*sbnoa <= args->agbno - gdiff) goto out_use_good; } @@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near( dofirst = prandom_u32() & 1; #endif + /* handle unitialized agbno range so caller doesn't have to */ + if (!args->min_agbno && !args->max_agbno) + args->max_agbno = args->mp->m_sb.sb_agblocks - 1; + ASSERT(args->min_agbno <= args->max_agbno); + + /* clamp agbno to the range if it's outside */ + if (args->agbno < args->min_agbno) + args->agbno = args->min_agbno; + if (args->agbno > args->max_agbno) + args->agbno = args->max_agbno; + restart: bno_cur_lt = NULL; bno_cur_gt = NULL; @@ -976,6 +1005,8 @@ restart: <bnoa, <lena); if (ltlena < args->minlen) continue; + if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) + continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ASSERT(args->len >= args->minlen); @@ -1096,11 +1127,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, <bnoa, <lena); - if (ltlena >= args->minlen) + if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) goto error0; - if (!i) { + if (!i || ltbnoa < args->min_agbno) { xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); bno_cur_lt = NULL; @@ -1112,11 +1143,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, >bnoa, >lena); - if (gtlena >= args->minlen) + if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; - if (!i) { + if (!i || gtbnoa > args->max_agbno) { xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); bno_cur_gt = NULL; @@ -1216,6 +1247,7 @@ restart: ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); args->agbno = ltnew; if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, @@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels( xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_perag *pag, + xfs_extlen_t need) { - xfs_extlen_t need, delta = 0; + xfs_extlen_t delta = 0; - need = XFS_MIN_FREELIST_PAG(pag, mp); if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; @@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +unsigned int +xfs_alloc_min_freelist( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + unsigned int min_free; + + /* space needed by-bno freespace btree */ + min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + mp->m_ag_maxlevels); + /* space needed by-size freespace btree */ + min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + mp->m_ag_maxlevels); + + return min_free; +} + +/* + * Check if the operation we are fixing up the freelist for should go ahead or + * not. If we are freeing blocks, we always allow it, otherwise the allocation + * is dependent on whether the size and shape of free space available will + * permit the requested allocation to take place. + */ +static bool +xfs_alloc_space_available( + struct xfs_alloc_arg *args, + xfs_extlen_t min_free, + int flags) +{ + struct xfs_perag *pag = args->pag; + xfs_extlen_t longest; + int available; + + if (flags & XFS_ALLOC_FLAG_FREEING) + return true; + + /* do we have enough contiguous free space for the allocation? */ + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + return false; + + /* do have enough free space remaining for the allocation? */ + available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + min_free - args->total); + if (available < (int)args->minleft) + return false; + + return true; +} + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ STATIC int /* error */ xfs_alloc_fix_freelist( - xfs_alloc_arg_t *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + struct xfs_alloc_arg *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ { - xfs_buf_t *agbp; /* agf buffer pointer */ - xfs_agf_t *agf; /* a.g. freespace structure pointer */ - xfs_buf_t *agflbp;/* agfl buffer pointer */ - xfs_agblock_t bno; /* freelist block */ - xfs_extlen_t delta; /* new blocks needed in freelist */ - int error; /* error result code */ - xfs_extlen_t longest;/* longest extent in allocation group */ - xfs_mount_t *mp; /* file system mount point structure */ - xfs_extlen_t need; /* total blocks needed in freelist */ - xfs_perag_t *pag; /* per-ag information structure */ - xfs_alloc_arg_t targs; /* local allocation arguments */ - xfs_trans_t *tp; /* transaction pointer */ - - mp = args->mp; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag = args->pag; + struct xfs_trans *tp = args->tp; + struct xfs_buf *agbp = NULL; + struct xfs_buf *agflbp = NULL; + struct xfs_alloc_arg targs; /* local allocation arguments */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t need; /* total blocks needed in freelist */ + int error; - pag = args->pag; - tp = args->tp; if (!pag->pagf_init) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; if (!pag->pagf_init) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - } else - agbp = NULL; + } /* - * If this is a metadata preferred pag and we are user data - * then try somewhere else if we are not being asked to - * try harder at this point + * If this is a metadata preferred pag and we are user data then try + * somewhere else if we are not being asked to try harder at this + * point */ if (pag->pagf_metadata && args->userdata && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - /* - * If it looks like there isn't a long enough extent, or enough - * total blocks, reject it. - */ - need = XFS_MIN_FREELIST_PAG(pag, mp); - longest = xfs_alloc_longest_free_extent(mp, pag); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(pag->pagf_freeblks + pag->pagf_flcount - - need - args->total) < (int)args->minleft)) { - if (agbp) - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; - } - } + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; /* * Get the a.g. freespace buffer. * Can fail if we're not blocking on locks, and it's held. */ - if (agbp == NULL) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; - if (agbp == NULL) { + if (!agbp) { + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; + if (!agbp) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; - } - } - /* - * Figure out how many blocks we should have in the freelist. - */ - agf = XFS_BUF_TO_AGF(agbp); - need = XFS_MIN_FREELIST(agf, mp); - /* - * If there isn't enough total or single-extent, reject it. - */ - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - delta = need > be32_to_cpu(agf->agf_flcount) ? - (need - be32_to_cpu(agf->agf_flcount)) : 0; - longest = be32_to_cpu(agf->agf_longest); - longest = (longest > delta) ? (longest - delta) : - (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(be32_to_cpu(agf->agf_freeblks) + - be32_to_cpu(agf->agf_flcount) - need - args->total) < - (int)args->minleft)) { - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + goto out_no_agbp; } } + + /* If there isn't enough total space or single-extent, reject it. */ + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; + /* * Make the freelist shorter if it's too long. + * + * Note that from this point onwards, we will always release the agf and + * agfl buffers on error. This handles the case where we error out and + * the buffers are clean or may not have been joined to the transaction + * and hence need to be released manually. If they have been joined to + * the transaction, then xfs_trans_brelse() will handle them + * appropriately based on the recursion count and dirty state of the + * buffer. + * + * XXX (dgc): When we have lots of free space, does this buy us + * anything other than extra overhead when we need to put more blocks + * back on the free list? Maybe we should only do this when space is + * getting low or the AGFL is more than half full? */ - while (be32_to_cpu(agf->agf_flcount) > need) { - xfs_buf_t *bp; + while (pag->pagf_flcount > need) { + struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) - return error; - if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) - return error; + goto out_agbp_relse; + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + if (error) + goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - /* - * Initialize the args structure. - */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; @@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) - return error; - /* - * Make the freelist longer if it's too short. - */ - while (be32_to_cpu(agf->agf_flcount) < need) { + error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + if (error) + goto out_agbp_relse; + + /* Make the freelist longer if it's too short. */ + while (pag->pagf_flcount < need) { targs.agbno = 0; - targs.maxlen = need - be32_to_cpu(agf->agf_flcount); - /* - * Allocate as many blocks as possible at once. - */ - if ((error = xfs_alloc_ag_vextent(&targs))) { - xfs_trans_brelse(tp, agflbp); - return error; - } + targs.maxlen = need - pag->pagf_flcount; + + /* Allocate as many blocks as possible at once. */ + error = xfs_alloc_ag_vextent(&targs); + if (error) + goto out_agflbp_relse; + /* * Stop if we run out. Won't happen if callers are obeying * the restrictions correctly. Can happen for free calls @@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist( if (targs.agbno == NULLAGBLOCK) { if (flags & XFS_ALLOC_FLAG_FREEING) break; - xfs_trans_brelse(tp, agflbp); - args->agbp = NULL; - return 0; + goto out_agflbp_relse; } /* * Put each allocated block on the list. @@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist( error = xfs_alloc_put_freelist(tp, agbp, agflbp, bno, 0); if (error) - return error; + goto out_agflbp_relse; } } xfs_trans_brelse(tp, agflbp); args->agbp = agbp; return 0; + +out_agflbp_relse: + xfs_trans_brelse(tp, agflbp); +out_agbp_relse: + if (agbp) + xfs_trans_brelse(tp, agbp); +out_no_agbp: + args->agbp = NULL; + return error; } /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d1b4b6a5c894..ca1c8168373a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg { xfs_extlen_t total; /* total blocks needed in xaction */ xfs_extlen_t alignment; /* align answer to multiple of this */ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ + xfs_agblock_t max_agbno; /* ... */ xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ @@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ -/* - * Find the length of the longest extent in an AG. - */ -xfs_extlen_t -xfs_alloc_longest_free_extent(struct xfs_mount *mp, +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag, xfs_extlen_t need); +unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0a472fbe06d4..3349c9a1e845 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -266,7 +266,7 @@ xfs_attr_set( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -276,7 +276,7 @@ xfs_attr_set( XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(args.trans); return error; } @@ -320,8 +320,7 @@ xfs_attr_set( xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); } - err2 = xfs_trans_commit(args.trans, - XFS_TRANS_RELEASE_LOG_RES); + err2 = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error ? error : err2; @@ -383,16 +382,14 @@ xfs_attr_set( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -462,7 +459,7 @@ xfs_attr_remove( error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, XFS_ATTRRM_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } @@ -501,16 +498,14 @@ xfs_attr_remove( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f1026e86dabc..63e05b663380 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ - int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork( tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); if (error) goto trans_cancel; - cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { @@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; bmap_cancel: xfs_bmap_cancel(&flist); trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (*blen < longest) *blen = longest; @@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten( error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, &tmp_logflags); - bma->logflags |= tmp_logflags; + /* + * Log the inode core unconditionally in the unwritten extent conversion + * path because the conversion might not have done so (e.g., if the + * extent count hasn't changed). We need to make sure the inode is dirty + * in the transaction for the sake of fsync(), even if nothing has + * changed, because fsync() will not force the log for this transaction + * unless it sees the inode pinned. + */ + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -5918,7 +5924,7 @@ xfs_bmap_split_extent( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -5936,10 +5942,9 @@ xfs_bmap_split_extent( if (error) goto out; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - + return xfs_trans_commit(tp); out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 4daaa662337b..a0ae572051de 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -170,7 +170,7 @@ typedef struct xfs_sb { __uint32_t sb_features_log_incompat; __uint32_t sb_crc; /* superblock crc */ - __uint32_t sb_pad; + xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ @@ -256,7 +256,7 @@ typedef struct xfs_dsb { __be32 sb_features_log_incompat; __le32 sb_crc; /* superblock crc */ - __be32 sb_pad; + __be32 sb_spino_align; /* sparse inode chunk alignment */ __be64 sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ @@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature( } #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE) + (XFS_SB_FEAT_INCOMPAT_FTYPE| \ + XFS_SB_FEAT_INCOMPAT_SPINODES) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } +static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); +} + /* * end of superblock version macros */ @@ -758,19 +766,6 @@ typedef struct xfs_agfl { #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) - -#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) -#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ - (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) -#define XFS_MIN_FREELIST(a,mp) \ - (XFS_MIN_FREELIST_RAW( \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) -#define XFS_MIN_FREELIST_PAG(pag,mp) \ - (XFS_MIN_FREELIST_RAW( \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) - #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) #define XFS_FSB_TO_AGNO(mp,fsbno) \ @@ -1216,26 +1211,54 @@ typedef __uint64_t xfs_inofree_t; #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) +#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ +#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) +#define XFS_INODES_PER_HOLEMASK_BIT \ + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) + static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* - * Data record structure + * The on-disk inode record structure has two formats. The original "full" + * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount + * and replaces the 3 high-order freecount bytes wth the holemask and inode + * count. + * + * The holemask of the sparse record format allows an inode chunk to have holes + * that refer to blocks not owned by the inode record. This facilitates inode + * allocation in the event of severe free space fragmentation. */ typedef struct xfs_inobt_rec { __be32 ir_startino; /* starting inode number */ - __be32 ir_freecount; /* count of free inodes (set bits) */ + union { + struct { + __be32 ir_freecount; /* count of free inodes */ + } f; + struct { + __be16 ir_holemask;/* hole mask for sparse chunks */ + __u8 ir_count; /* total inode count */ + __u8 ir_freecount; /* count of free inodes */ + } sp; + } ir_u; __be64 ir_free; /* free inode mask */ } xfs_inobt_rec_t; typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ + __uint16_t ir_holemask; /* hole mask for sparse chunks */ + __uint8_t ir_count; /* total inode count */ + __uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; +static inline bool xfs_inobt_issparse(uint16_t holemask) +{ + /* non-zero holemask represents a sparse rec. */ + return holemask; +} /* * Key structure @@ -1453,8 +1476,8 @@ struct xfs_acl { sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" -#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE "SGI_ACL_FILE" +#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 18dc721ca19f..89689c6a43e2 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ +#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 1c9e75521250..66efc702452a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -65,6 +65,8 @@ xfs_inobt_lookup( int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_holemask = 0; + cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); @@ -82,7 +84,14 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); + rec.inobt.ir_u.sp.ir_count = irec->ir_count; + rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); + } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } @@ -100,12 +109,27 @@ xfs_inobt_get_rec( int error; error = xfs_btree_get_rec(cur, &rec, stat); - if (!error && *stat == 1) { - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); - irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + if (error || *stat == 0) + return error; + + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); + irec->ir_count = rec->inobt.ir_u.sp.ir_count; + irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; + } else { + /* + * ir_holemask/ir_count not supported on-disk. Fill in hardcoded + * values for full inode chunks. + */ + irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; + irec->ir_count = XFS_INODES_PER_CHUNK; + irec->ir_freecount = + be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } - return error; + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + + return 0; } /* @@ -114,10 +138,14 @@ xfs_inobt_get_rec( STATIC int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, + __uint16_t holemask, + __uint8_t count, __int32_t freecount, xfs_inofree_t free, int *stat) { + cur->bc_rec.i.ir_holemask = holemask; + cur->bc_rec.i.ir_count = count; cur->bc_rec.i.ir_freecount = freecount; cur->bc_rec.i.ir_free = free; return xfs_btree_insert(cur, stat); @@ -154,7 +182,9 @@ xfs_inobt_insert( } ASSERT(i == 0); - error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, + XFS_INODES_PER_CHUNK, + XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -220,6 +250,7 @@ xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, struct list_head *buffer_list, + int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -275,7 +306,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + xfs_icreate_log(tp, agno, agbno, icount, mp->m_sb.sb_inodesize, length, gen); } else version = 2; @@ -347,6 +378,214 @@ xfs_ialloc_inode_init( } /* + * Align startino and allocmask for a recently allocated sparse chunk such that + * they are fit for insertion (or merge) into the on-disk inode btrees. + * + * Background: + * + * When enabled, sparse inode support increases the inode alignment from cluster + * size to inode chunk size. This means that the minimum range between two + * non-adjacent inode records in the inobt is large enough for a full inode + * record. This allows for cluster sized, cluster aligned block allocation + * without need to worry about whether the resulting inode record overlaps with + * another record in the tree. Without this basic rule, we would have to deal + * with the consequences of overlap by potentially undoing recent allocations in + * the inode allocation codepath. + * + * Because of this alignment rule (which is enforced on mount), there are two + * inobt possibilities for newly allocated sparse chunks. One is that the + * aligned inode record for the chunk covers a range of inodes not already + * covered in the inobt (i.e., it is safe to insert a new sparse record). The + * other is that a record already exists at the aligned startino that considers + * the newly allocated range as sparse. In the latter case, record content is + * merged in hope that sparse inode chunks fill to full chunks over time. + */ +STATIC void +xfs_align_sparse_ino( + struct xfs_mount *mp, + xfs_agino_t *startino, + uint16_t *allocmask) +{ + xfs_agblock_t agbno; + xfs_agblock_t mod; + int offset; + + agbno = XFS_AGINO_TO_AGBNO(mp, *startino); + mod = agbno % mp->m_sb.sb_inoalignmt; + if (!mod) + return; + + /* calculate the inode offset and align startino */ + offset = mod << mp->m_sb.sb_inopblog; + *startino -= offset; + + /* + * Since startino has been aligned down, left shift allocmask such that + * it continues to represent the same physical inodes relative to the + * new startino. + */ + *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; +} + +/* + * Determine whether the source inode record can merge into the target. Both + * records must be sparse, the inode ranges must match and there must be no + * allocation overlap between the records. + */ +STATIC bool +__xfs_inobt_can_merge( + struct xfs_inobt_rec_incore *trec, /* tgt record */ + struct xfs_inobt_rec_incore *srec) /* src record */ +{ + uint64_t talloc; + uint64_t salloc; + + /* records must cover the same inode range */ + if (trec->ir_startino != srec->ir_startino) + return false; + + /* both records must be sparse */ + if (!xfs_inobt_issparse(trec->ir_holemask) || + !xfs_inobt_issparse(srec->ir_holemask)) + return false; + + /* both records must track some inodes */ + if (!trec->ir_count || !srec->ir_count) + return false; + + /* can't exceed capacity of a full record */ + if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) + return false; + + /* verify there is no allocation overlap */ + talloc = xfs_inobt_irec_to_allocmask(trec); + salloc = xfs_inobt_irec_to_allocmask(srec); + if (talloc & salloc) + return false; + + return true; +} + +/* + * Merge the source inode record into the target. The caller must call + * __xfs_inobt_can_merge() to ensure the merge is valid. + */ +STATIC void +__xfs_inobt_rec_merge( + struct xfs_inobt_rec_incore *trec, /* target */ + struct xfs_inobt_rec_incore *srec) /* src */ +{ + ASSERT(trec->ir_startino == srec->ir_startino); + + /* combine the counts */ + trec->ir_count += srec->ir_count; + trec->ir_freecount += srec->ir_freecount; + + /* + * Merge the holemask and free mask. For both fields, 0 bits refer to + * allocated inodes. We combine the allocated ranges with bitwise AND. + */ + trec->ir_holemask &= srec->ir_holemask; + trec->ir_free &= srec->ir_free; +} + +/* + * Insert a new sparse inode chunk into the associated inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * This function supports two modes of handling preexisting records depending on + * the merge flag. If merge is true, the provided record is merged with the + * existing record and updated in place. The merged record is returned in nrec. + * If merge is false, an existing record is replaced with the provided record. + * If no preexisting record exists, the provided record is always inserted. + * + * It is considered corruption if a merge is requested and not possible. Given + * the sparse inode alignment constraints, this should never happen. + */ +STATIC int +xfs_inobt_insert_sprec( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + int btnum, + struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ + bool merge) /* merge or replace */ +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + int error; + int i; + struct xfs_inobt_rec_incore rec; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + + goto out; + } + + /* + * A record exists at this startino. Merge or replace the record + * depending on what we've been asked to do. + */ + if (merge) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, + rec.ir_startino == nrec->ir_startino, + error); + + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), + error); + + trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); + + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); + + trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + nrec->ir_holemask); + + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; + } + + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + +out: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ @@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc( xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */ + struct xfs_inobt_rec_incore rec; struct xfs_perag *pag; + int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; + args.fsbno = NULLFSBLOCK; + +#ifdef DEBUG + /* randomly do sparse inode allocations */ + if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks) + do_sparse = prandom_u32() & 1; +#endif /* * Locking will ensure that we don't have two callers in here @@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc( agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + args.mp->m_ialloc_blks; + if (do_sparse) + goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc( * subsequent requests. */ args.minalignslop = 0; - } else - args.fsbno = NULLFSBLOCK; + } if (unlikely(args.fsbno == NULLFSBLOCK)) { /* @@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc( return error; } + /* + * Finally, try a sparse allocation if the filesystem supports it and + * the sparse allocation length is smaller than a full chunk. + */ + if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && + args.fsbno == NULLFSBLOCK) { +sparse_alloc: + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = args.mp->m_sb.sb_spino_align; + args.prod = 1; + + args.minlen = args.mp->m_ialloc_min_blks; + args.maxlen = args.minlen; + + /* + * The inode record will be aligned to full chunk size. We must + * prevent sparse allocation from AG boundaries that result in + * invalid inode records, such as records that start at agbno 0 + * or extend beyond the AG. + * + * Set min agbno to the first aligned, non-zero agbno and max to + * the last aligned agbno that is at least one full chunk from + * the end of the AG. + */ + args.min_agbno = args.mp->m_sb.sb_inoalignmt; + args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.mp->m_sb.sb_inoalignmt) - + args.mp->m_ialloc_blks; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + newlen = args.len << args.mp->m_sb.sb_inopblog; + ASSERT(newlen <= XFS_INODES_PER_CHUNK); + allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; + } + if (args.fsbno == NULLFSBLOCK) { *alloc = 0; return 0; @@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, - args.len, prandom_u32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, + args.agbno, args.len, prandom_u32()); if (error) return error; @@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc( * Convert the results. */ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + + if (xfs_inobt_issparse(~allocmask)) { + /* + * We've allocated a sparse chunk. Align the startino and mask. + */ + xfs_align_sparse_ino(args.mp, &newino, &allocmask); + + rec.ir_startino = newino; + rec.ir_holemask = ~allocmask; + rec.ir_count = newlen; + rec.ir_freecount = newlen; + rec.ir_free = XFS_INOBT_ALL_FREE; + + /* + * Insert the sparse record into the inobt and allow for a merge + * if necessary. If a merge does occur, rec is updated to the + * merged record. + */ + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, + &rec, true); + if (error == -EFSCORRUPTED) { + xfs_alert(args.mp, + "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", + XFS_AGINO_TO_INO(args.mp, agno, + rec.ir_startino), + rec.ir_holemask, rec.ir_count); + xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); + } + if (error) + return error; + + /* + * We can't merge the part we've just allocated as for the inobt + * due to finobt semantics. The original record may or may not + * exist independent of whether physical inodes exist in this + * sparse chunk. + * + * We must update the finobt record based on the inobt record. + * rec contains the fully merged and up to date inobt record + * from the previous call. Set merge false to replace any + * existing record with this one. + */ + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, + XFS_BTNUM_FINO, &rec, + false); + if (error) + return error; + } + } else { + /* full chunk - insert new records to both btrees */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, + newlen, XFS_BTNUM_FINO); + if (error) + return error; + } + } + + /* + * Update AGI counts and newino. + */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag = xfs_perag_get(args.mp, agno); @@ -512,20 +871,6 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btrees. - */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_INO); - if (error) - return error; - - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_FINO); - if (error) - return error; - } - /* * Log allocation group header fields */ xfs_ialloc_log_agi(tp, agbp, @@ -645,7 +990,7 @@ xfs_ialloc_ag_select( * if we fail allocation due to alignment issues then it is most * likely a real ENOSPC condition. */ - ineed = mp->m_ialloc_blks; + ineed = mp->m_ialloc_min_blks; if (flags && ineed > 1) ineed += xfs_ialloc_cluster_alignment(mp); longest = pag->pagf_longest; @@ -732,6 +1077,27 @@ xfs_ialloc_get_rec( } /* + * Return the offset of the first free inode in the record. If the inode chunk + * is sparsely allocated, we convert the record holemask to inode granularity + * and mask off the unallocated regions from the inode free mask. + */ +STATIC int +xfs_inobt_first_free_inode( + struct xfs_inobt_rec_incore *rec) +{ + xfs_inofree_t realfree; + + /* if there are no holes, return the first available offset */ + if (!xfs_inobt_issparse(rec->ir_holemask)) + return xfs_lowbit64(rec->ir_free); + + realfree = xfs_inobt_irec_to_allocmask(rec); + realfree &= rec->ir_free; + + return xfs_lowbit64(realfree); +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -961,7 +1327,7 @@ newino: } alloc_inode: - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1210,7 +1576,7 @@ xfs_dialloc_ag( if (error) goto error_cur; - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1439,6 +1805,83 @@ out_error: return error; } +/* + * Free the blocks of an inode chunk. We must consider that the inode chunk + * might be sparse and only free the regions that are allocated as part of the + * chunk. + */ +STATIC void +xfs_difree_inode_chunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *rec, + struct xfs_bmap_free *flist) +{ + xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); + int startidx, endidx; + int nextbit; + xfs_agblock_t agbno; + int contigblk; + DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); + + if (!xfs_inobt_issparse(rec->ir_holemask)) { + /* not sparse, calculate extent info directly */ + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), + mp->m_ialloc_blks, flist, mp); + return; + } + + /* holemask is only 16-bits (fits in an unsigned long) */ + ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); + holemask[0] = rec->ir_holemask; + + /* + * Find contiguous ranges of zeroes (i.e., allocated regions) in the + * holemask and convert the start/end index of each range to an extent. + * We start with the start and end index both pointing at the first 0 in + * the mask. + */ + startidx = endidx = find_first_zero_bit(holemask, + XFS_INOBT_HOLEMASK_BITS); + nextbit = startidx + 1; + while (startidx < XFS_INOBT_HOLEMASK_BITS) { + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, + nextbit); + /* + * If the next zero bit is contiguous, update the end index of + * the current range and continue. + */ + if (nextbit != XFS_INOBT_HOLEMASK_BITS && + nextbit == endidx + 1) { + endidx = nextbit; + goto next; + } + + /* + * nextbit is not contiguous with the current end index. Convert + * the current start/end to an extent and add it to the free + * list. + */ + agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + contigblk = ((endidx - startidx + 1) * + XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + + ASSERT(agbno % mp->m_sb.sb_spino_align == 0); + ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + flist, mp); + + /* reset range to current bit and carry on... */ + startidx = endidx = nextbit; + +next: + nextbit++; + } +} + STATIC int xfs_difree_inobt( struct xfs_mount *mp, @@ -1446,8 +1889,7 @@ xfs_difree_inobt( struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_bmap_free *flist, - int *deleted, - xfs_ino_t *first_ino, + struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); @@ -1501,20 +1943,23 @@ xfs_difree_inobt( rec.ir_freecount++; /* - * When an inode cluster is free, it becomes eligible for removal + * When an inode chunk is free, it becomes eligible for removal. Don't + * remove the chunk if the block size is large enough for multiple inode + * chunks (that might not be free). */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == mp->m_ialloc_inos)) { - - *deleted = 1; - *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + xic->deleted = 1; + xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* * Remove the inode cluster from the AGI B+Tree, adjust the * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = mp->m_ialloc_inos; + ilen = rec.ir_freecount; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1530,11 +1975,9 @@ xfs_difree_inobt( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_difree_inode_chunk(mp, agno, &rec, flist); } else { - *deleted = 0; + xic->deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1599,7 +2042,9 @@ xfs_difree_finobt( */ XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); - error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, + ibtrec->ir_count, + ibtrec->ir_freecount, ibtrec->ir_free, &i); if (error) goto error; @@ -1634,8 +2079,13 @@ xfs_difree_finobt( * free inode. Hence, if all of the inodes are free and we aren't * keeping inode chunks permanently on disk, remove the record. * Otherwise, update the record with the new information. + * + * Note that we currently can't free chunks when the block size is large + * enough for multiple chunks. Leave the finobt record to remain in sync + * with the inobt. */ - if (rec.ir_freecount == mp->m_ialloc_inos && + if (rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && !(mp->m_flags & XFS_MOUNT_IKEEP)) { error = xfs_btree_delete(cur, &i); if (error) @@ -1671,8 +2121,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted,/* set if inode cluster was deleted */ - xfs_ino_t *first_ino)/* first inode in deleted cluster */ + struct xfs_icluster *xic) /* cluster info if deleted */ { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ @@ -1723,8 +2172,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, - &rec); + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 100007d56449..6e450df2979b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -28,6 +28,13 @@ struct xfs_btree_cur; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 +struct xfs_icluster { + bool deleted; /* record is deleted */ + xfs_ino_t first_ino; /* first inode number */ + uint64_t alloc; /* inode phys. allocation bitmap for + * sparse chunks */ +}; + /* Calculate and return the number of filesystem blocks per inode cluster */ static inline int xfs_icluster_size_fsb( @@ -44,8 +51,7 @@ xfs_icluster_size_fsb( static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (struct xfs_dinode *) - (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); + return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog); } /* @@ -90,8 +96,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino); /* first inode in deleted cluster */ + struct xfs_icluster *ifree); /* cluster info if deleted */ /* * Return the location of the inode in imap, for mapping it into a buffer. @@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, * Inode chunk initialisation routine */ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, - struct list_head *buffer_list, + struct list_head *buffer_list, int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 964c465ca69c..674ad8f760be 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec->inobt.ir_u.sp.ir_holemask = + cpu_to_be16(cur->bc_rec.i.ir_holemask); + rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; + rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec->inobt.ir_u.f.ir_freecount = + cpu_to_be32(cur->bc_rec.i.ir_freecount); + } rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); } @@ -418,3 +427,85 @@ xfs_inobt_maxrecs( return blocklen / sizeof(xfs_inobt_rec_t); return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } + +/* + * Convert the inode record holemask to an inode allocation bitmap. The inode + * allocation bitmap is inode granularity and specifies whether an inode is + * physically allocated on disk (not whether the inode is considered allocated + * or free by the fs). + * + * A bit value of 1 means the inode is allocated, a value of 0 means it is free. + */ +uint64_t +xfs_inobt_irec_to_allocmask( + struct xfs_inobt_rec_incore *rec) +{ + uint64_t bitmap = 0; + uint64_t inodespbit; + int nextbit; + uint allocbitmap; + + /* + * The holemask has 16-bits for a 64 inode record. Therefore each + * holemask bit represents multiple inodes. Create a mask of bits to set + * in the allocmask for each holemask bit. + */ + inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1; + + /* + * Allocated inodes are represented by 0 bits in holemask. Invert the 0 + * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask + * anything beyond the 16 holemask bits since this casts to a larger + * type. + */ + allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1); + + /* + * allocbitmap is the inverted holemask so every set bit represents + * allocated inodes. To expand from 16-bit holemask granularity to + * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target + * bitmap for every holemask bit. + */ + nextbit = xfs_next_bit(&allocbitmap, 1, 0); + while (nextbit != -1) { + ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY)); + + bitmap |= (inodespbit << + (nextbit * XFS_INODES_PER_HOLEMASK_BIT)); + + nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1); + } + + return bitmap; +} + +#if defined(DEBUG) || defined(XFS_WARN) +/* + * Verify that an in-core inode record has a valid inode count. + */ +int +xfs_inobt_rec_check_count( + struct xfs_mount *mp, + struct xfs_inobt_rec_incore *rec) +{ + int inocount = 0; + int nextbit = 0; + uint64_t allocbmap; + int wordsz; + + wordsz = sizeof(allocbmap) / sizeof(unsigned int); + allocbmap = xfs_inobt_irec_to_allocmask(rec); + + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); + while (nextbit != -1) { + inocount++; + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, + nextbit + 1); + } + + if (inocount != rec->ir_count) + return -EFSCORRUPTED; + + return 0; +} +#endif /* DEBUG */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index d7ebea72c2d0..bd88453217ce 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +/* ir_holemask to inode allocation bitmap conversion */ +uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); + +#if defined(DEBUG) || defined(XFS_WARN) +int xfs_inobt_rec_check_count(struct xfs_mount *, + struct xfs_inobt_rec_incore *); +#else +#define xfs_inobt_rec_check_count(mp, rec) 0 +#endif /* DEBUG */ + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 002b6b3a1988..6526e7696184 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -46,8 +46,7 @@ xfs_inobp_check( j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); + dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", @@ -86,8 +85,7 @@ xfs_inode_buf_verify( int di_ok; xfs_dinode_t *dip; - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); + dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, @@ -186,7 +184,7 @@ xfs_imap_to_bp( } *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index dc4bfc5d88fc..df9851c46b5c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -174,6 +174,27 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (xfs_sb_version_hassparseinodes(sbp)) { + uint32_t align; + + xfs_alert(mp, + "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); + + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, +"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", + sbp->sb_inoalignmt, align); + return -EINVAL; + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -374,7 +395,7 @@ __xfs_sb_from_disk( be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; - to->sb_pad = 0; + to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); /* Convert on-disk flags to in-memory flags? */ @@ -516,7 +537,7 @@ xfs_sb_to_disk( cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); - to->sb_pad = 0; + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); } } @@ -689,6 +710,11 @@ xfs_sb_mount_common( mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; + + if (sbp->sb_spino_align) + mp->m_ialloc_min_blks = sbp->sb_spino_align; + else + mp->m_ialloc_min_blks = mp->m_ialloc_blks; } /* @@ -792,12 +818,12 @@ xfs_sync_sb( tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 8dda4b321343..5be529707903 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer count in superblock */ /* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* * Field values for xfs_trans_mod_sb. */ #define XFS_TRANS_SB_ICOUNT 0x00000001 diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 2d5bdfce6d8f..797815012c0e 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,9 +73,9 @@ struct xfs_trans_resv { * 2 trees * (2 blocks/level * max depth - 1) * block size */ #define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1))) #define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1))) /* * Per-directory log reservation for any directory change. diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index bf9c4579334d..41e0428d8175 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -67,7 +67,7 @@ #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ - (2 * XFS_AG_MAXLEVELS(mp)) + (2 * (mp)->m_ag_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) #define XFS_LINK_SPACE_RES(mp,nl) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e5099f268032..3859f5e27a4d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -145,7 +145,7 @@ xfs_setfilesize( isize = xfs_new_eof(ip, offset + size); if (!isize) { xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return 0; } @@ -155,7 +155,7 @@ xfs_setfilesize( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } STATIC int @@ -1348,7 +1348,7 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - int direct) + bool direct) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1413,6 +1413,7 @@ __xfs_get_blocks( if (error) return error; new = 1; + } else { /* * Delalloc reservations do not require a transaction, @@ -1507,49 +1508,29 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); + return __xfs_get_blocks(inode, iblock, bh_result, create, false); } -STATIC int +int xfs_get_blocks_direct( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); + return __xfs_get_blocks(inode, iblock, bh_result, create, true); } -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). - */ -STATIC void -xfs_end_io_direct_write( - struct kiocb *iocb, +static void +__xfs_end_io_direct_write( + struct inode *inode, + struct xfs_ioend *ioend, loff_t offset, - ssize_t size, - void *private) + ssize_t size) { - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_ioend *ioend = private; - - trace_xfs_gbmap_direct_endio(ip, offset, size, - ioend ? ioend->io_type : 0, NULL); + struct xfs_mount *mp = XFS_I(inode)->i_mount; - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return; - } - - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) goto out_end_io; /* @@ -1586,10 +1567,10 @@ xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. */ - spin_lock(&ip->i_flags_lock); + spin_lock(&XFS_I(inode)->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); + spin_unlock(&XFS_I(inode)->i_flags_lock); /* * If we are doing an append IO that needs to update the EOF on disk, @@ -1606,6 +1587,98 @@ out_end_io: return; } +/* + * Complete a direct I/O write request. + * + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_ioend *ioend = private; + + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } + + __xfs_end_io_direct_write(inode, ioend, offset, size); +} + +/* + * For DAX we need a mapping buffer callback for unwritten extent conversion + * when page faults allocate blocks and then zero them. Note that in this + * case the mapping indicated by the ioend may extend beyond EOF. We most + * definitely do not want to extend EOF here, so we trim back the ioend size to + * EOF. + */ +#ifdef CONFIG_FS_DAX +void +xfs_end_io_dax_write( + struct buffer_head *bh, + int uptodate) +{ + struct xfs_ioend *ioend = bh->b_private; + struct inode *inode = ioend->io_inode; + ssize_t size = ioend->io_size; + + ASSERT(IS_DAX(ioend->io_inode)); + + /* if there was an error zeroing, then don't convert it */ + if (!uptodate) + ioend->io_error = -EIO; + + /* + * Trim update to EOF, so we don't extend EOF during unwritten extent + * conversion of partial EOF blocks. + */ + spin_lock(&XFS_I(inode)->i_flags_lock); + if (ioend->io_offset + size > i_size_read(inode)) + size = i_size_read(inode) - ioend->io_offset; + spin_unlock(&XFS_I(inode)->i_flags_lock); + + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); + +} +#else +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } +#endif + +static inline ssize_t +xfs_vm_do_dio( + struct inode *inode, + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset, + void (*endio)(struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private), + int flags) +{ + struct block_device *bdev; + + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, + xfs_get_blocks_direct, endio, 0); + + bdev = xfs_find_bdev_for_inode(inode); + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, endio, NULL, flags); +} + STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1613,16 +1686,11 @@ xfs_vm_direct_IO( loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - if (iov_iter_rw(iter) == WRITE) { - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, - DIO_ASYNC_EXTEND); - } - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, NULL, NULL, 0); + if (iov_iter_rw(iter) == WRITE) + return xfs_vm_do_dio(inode, iocb, iter, offset, + xfs_end_io_direct_write, DIO_ASYNC_EXTEND); + return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ac644e0137a4..86afd1ac7895 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -53,7 +53,12 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +int xfs_get_blocks(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +int xfs_get_blocks_direct(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 3fbf167cfb4c..2bb959ada45b 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -394,7 +394,6 @@ xfs_attr_inactive( { struct xfs_trans *trans; struct xfs_mount *mp; - int cancel_flags = 0; int lock_mode = XFS_ILOCK_SHARED; int error = 0; @@ -423,7 +422,6 @@ xfs_attr_inactive( goto out_cancel; lock_mode = XFS_ILOCK_EXCL; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; xfs_ilock(dp, lock_mode); if (!XFS_IFORK_Q(dp)) @@ -435,8 +433,14 @@ xfs_attr_inactive( */ xfs_trans_ijoin(trans, dp, 0); - /* invalidate and truncate the attribute fork extents */ - if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + /* + * Invalidate and truncate the attribute fork extents. Make sure the + * fork actually has attributes as otherwise the invalidation has no + * blocks to read and returns an error. In this case, just do the fork + * removal below. + */ + if (xfs_inode_hasattr(dp) && + dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -449,12 +453,12 @@ xfs_attr_inactive( /* Reset the attribute fork - this also destroys the in-core fork */ xfs_attr_fork_remove(dp, trans); - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(trans); xfs_iunlock(dp, lock_mode); return error; out_cancel: - xfs_trans_cancel(trans, cancel_flags); + xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ if (dp->i_afp) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a52bbd3abc7d..0f34886cf726 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,28 +75,20 @@ xfs_bmap_finish( xfs_efi_log_item_t *efi; /* extent free intention */ int error; /* error return value */ xfs_bmap_free_item_t *free; /* free extent item */ - struct xfs_trans_res tres; /* new log reservation */ xfs_mount_t *mp; /* filesystem mount structure */ xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { *committed = 0; return 0; } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - tres.tr_logres = ntp->t_log_res; - tres.tr_logcount = ntp->t_log_count; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; + error = xfs_trans_roll(tp, NULL); *committed = 1; /* * We have a new transaction, so we should return committed=1, @@ -105,19 +97,10 @@ xfs_bmap_finish( if (error) return error; - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - error = xfs_trans_reserve(ntp, &tres, 0, 0); - if (error) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + if ((error = xfs_free_extent(*tp, free->xbfi_startblock, free->xbfi_blockcount))) { /* * The bmap free list will be cleaned up at a @@ -127,7 +110,7 @@ xfs_bmap_finish( * happens, since this transaction may not be * dirty yet. */ - mp = ntp->t_mountp; + mp = (*tp)->t_mountp; if (!XFS_FORCED_SHUTDOWN(mp)) xfs_force_shutdown(mp, (error == -EFSCORRUPTED) ? @@ -135,7 +118,7 @@ xfs_bmap_finish( SHUTDOWN_META_IO_ERROR); return error; } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); xfs_bmap_del_free(flist, NULL, free); } @@ -878,7 +861,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return -EAGAIN; } } @@ -886,7 +869,7 @@ xfs_free_eofblocks( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -908,12 +891,9 @@ xfs_free_eofblocks( * If we get an error at this point we simply don't * bother truncating the file. */ - xfs_trans_cancel(tp, - (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); } else { - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (!error) xfs_inode_clear_eofblocks_tag(ip); } @@ -1026,7 +1006,7 @@ xfs_alloc_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1053,7 +1033,7 @@ xfs_alloc_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { break; @@ -1077,7 +1057,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes( break; ASSERT(imap.br_blockcount >= 1); ASSERT(imap.br_startoff == offset_fsb); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) { + /* skip the entire extent */ + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + + imap.br_blockcount) - 1; + continue; + } + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; if (lastoffset > endoff) lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) + + /* DAX can just zero the backing device directly */ + if (IS_DAX(VFS_I(ip))) { + error = dax_zero_page_range(VFS_I(ip), offset, + lastoffset - offset + 1, + xfs_get_blocks_direct); + if (error) + return error; continue; + } error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -1289,7 +1284,7 @@ xfs_free_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1320,7 +1315,7 @@ xfs_free_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -1330,7 +1325,7 @@ xfs_free_file_space( error0: xfs_bmap_cancel(&free_list); error1: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out; } @@ -1462,7 +1457,7 @@ xfs_shift_file_space( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } @@ -1492,13 +1487,13 @@ xfs_shift_file_space( if (error) goto out; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); } return error; out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -1718,7 +1713,7 @@ xfs_swap_extents( tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_unlock; } @@ -1901,7 +1896,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); @@ -1915,6 +1910,6 @@ out_unlock: goto out; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1790b00bea7a..a4b7d92e946c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1419,9 +1419,9 @@ xfs_buf_submit_wait( return error; } -xfs_caddr_t +void * xfs_buf_offset( - xfs_buf_t *bp, + struct xfs_buf *bp, size_t offset) { struct page *page; @@ -1431,7 +1431,7 @@ xfs_buf_offset( offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); + return page_address(page) + (offset & (PAGE_SIZE-1)); } /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 75ff5d5a7d2e..331c1ccf8264 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) /* Buffer Utility Routines */ -extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); +extern void *xfs_buf_offset(struct xfs_buf *, size_t); /* Delayed Write Buffer Routines */ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 02c01bbbc789..4143dc75dca4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -568,8 +568,6 @@ xfs_qm_dqread( struct xfs_buf *bp; struct xfs_trans *tp = NULL; int error; - int cancelflags = 0; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -617,7 +615,6 @@ xfs_qm_dqread( XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; } /* @@ -632,7 +629,6 @@ xfs_qm_dqread( * allocate (ENOENT). */ trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; goto error1; } @@ -670,7 +666,7 @@ xfs_qm_dqread( xfs_trans_brelse(tp, bp); if (tp) { - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; } @@ -680,7 +676,7 @@ xfs_qm_dqread( error1: if (tp) - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 338e50bbfd1e..74d0e5966ebc 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -127,7 +127,7 @@ xfs_error_report( struct xfs_mount *mp, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, @@ -146,7 +146,7 @@ xfs_corruption_error( void *p, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 64); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c0394ed126fc..4ed3042a0f16 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -21,10 +21,10 @@ struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, inst_t *ra); + const char *filename, int linenum, void *ra); extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, - int linenum, inst_t *ra); + int linenum, void *ra); extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index cb7fe64cdbfa..adc8f8fdd145 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -239,7 +239,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; - efip->efi_format.efi_id = (__psint_t)(void*)efip; + efip->efi_format.efi_id = (uintptr_t)(void *)efip; atomic_set(&efip->efi_next_extent, 0); atomic_set(&efip->efi_refcount, 2); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7c62fca53e2f..f0e8249722d4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,14 +80,15 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero + * xfs_iozero clears the specified range supplied via the page cache (except in + * the DAX case). Writes through the page cache will allocate blocks over holes, + * though the callers usually map the holes first and avoid them. If a block is + * not completely zeroed, then it will be read from disk before being partially + * zeroed. * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. + * In the DAX case, we can just directly write to the underlying pages. This + * will not allocate blocks, but will avoid holes and unwritten extents and so + * not do unnecessary work. */ int xfs_iozero( @@ -97,7 +98,8 @@ xfs_iozero( { struct page *page; struct address_space *mapping; - int status; + int status = 0; + mapping = VFS_I(ip)->i_mapping; do { @@ -109,20 +111,27 @@ xfs_iozero( if (bytes > count) bytes = count; - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; + if (IS_DAX(VFS_I(ip))) { + status = dax_zero_page_range(VFS_I(ip), pos, bytes, + xfs_get_blocks_direct); + if (status) + break; + } else { + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; - zero_user(page, offset, bytes); + zero_user(page, offset, bytes); - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ + status = pagecache_write_end(NULL, mapping, pos, bytes, + bytes, page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + status = 0; + } pos += bytes; count -= bytes; - status = 0; } while (count); return status; @@ -139,7 +148,7 @@ xfs_update_prealloc_flags( tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -161,7 +170,7 @@ xfs_update_prealloc_flags( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flags & XFS_PREALLOC_SYNC) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } /* @@ -285,7 +294,7 @@ xfs_file_read_iter( if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -379,7 +388,11 @@ xfs_file_splice_read( trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + /* for dax, we need to avoid the page cache */ + if (IS_DAX(VFS_I(ip))) + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); + else + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -564,6 +577,13 @@ restart: if (error) return error; + /* For changing security info in file_remove_privs() we need i_mutex */ + if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + goto restart; + } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -624,7 +644,9 @@ restart: * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ - return file_remove_suid(file); + if (!IS_NOSEC(inode)) + return file_remove_privs(file); + return 0; } /* @@ -673,7 +695,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -759,8 +781,11 @@ xfs_file_dio_aio_write( out: xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. DAX can result in + * partial writes, but direct IO will either complete fully or fail. + */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -843,7 +868,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1064,17 +1089,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1455,48 +1469,83 @@ xfs_file_llseek( * ordering of: * * mmap_sem (MM) - * i_mmap_lock (XFS - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) + * sb_start_pagefault(vfs, freeze) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. */ STATIC int -xfs_filemap_fault( +xfs_filemap_page_mkwrite( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct inode *inode = file_inode(vma->vm_file); + int ret; - trace_xfs_filemap_fault(ip); + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - return error; + if (IS_DAX(inode)) { + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + xfs_end_io_dax_write); + } else { + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite_return(ret); + } + + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; } -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. - */ STATIC int -xfs_filemap_page_mkwrite( +xfs_filemap_fault( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); + int ret; - trace_xfs_filemap_page_mkwrite(ip); + trace_xfs_filemap_fault(ip); + + /* DAX can shortcut the normal fault path on write faults! */ + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) + return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = filemap_fault(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - return error; + return ret; +} + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + vma->vm_ops = &xfs_file_vm_ops; + if (IS_DAX(file_inode(filp))) + vma->vm_flags |= VM_MIXEDMAP; + return 0; } const struct file_operations xfs_file_operations = { @@ -1527,9 +1576,3 @@ const struct file_operations xfs_dir_file_operations = { #endif .fsync = xfs_dir_fsync, }; - -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = xfs_filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_filemap_page_mkwrite, -}; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index da82f1cb4b9b..c4c130f9bfb6 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -196,7 +196,8 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cb7e8a29dfb6..9b3438a7680f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -101,7 +101,9 @@ xfs_fs_geometry( (xfs_sb_version_hasftype(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | (xfs_sb_version_hasfinobt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FINOBT : 0); + XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | + (xfs_sb_version_hassparseinodes(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SPINODES : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -201,7 +203,7 @@ xfs_growfs_data_private( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -489,7 +491,7 @@ xfs_growfs_data_private( if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) return error; @@ -557,7 +559,7 @@ xfs_growfs_data_private( return saved_error ? saved_error : error; error0: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 539a85fddbc2..3da9f4da4f3d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -905,7 +905,6 @@ xfs_dir_ialloc( { xfs_trans_t *tp; - xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; int code; @@ -954,8 +953,6 @@ xfs_dir_ialloc( * to succeed the second time. */ if (ialloc_context) { - struct xfs_trans_res tres; - /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across @@ -964,12 +961,6 @@ xfs_dir_ialloc( * allocation group. */ xfs_trans_bhold(tp, ialloc_context); - /* - * Save the log reservation so we can use - * them in the next transaction. - */ - tres.tr_logres = xfs_trans_get_log_res(tp); - tres.tr_logcount = xfs_trans_get_log_count(tp); /* * We want the quota changes to be associated with the next @@ -985,35 +976,9 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0); - tp = ntp; - if (committed != NULL) { + code = xfs_trans_roll(&tp, 0); + if (committed != NULL) *committed = 1; - } - /* - * If we get an error during the commit processing, - * release the buffer that is still held and return - * to the caller. - */ - if (code) { - xfs_buf_relse(ialloc_context); - if (dqinfo) { - tp->t_dqinfo = dqinfo; - xfs_trans_free_dqinfo(tp); - } - *tpp = ntp; - *ipp = NULL; - return code; - } - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - code = xfs_trans_reserve(tp, &tres, 0, 0); /* * Re-attach the quota info that we detached from prev trx. @@ -1025,7 +990,7 @@ xfs_dir_ialloc( if (code) { xfs_buf_relse(ialloc_context); - *tpp = ntp; + *tpp = tp; *ipp = NULL; return code; } @@ -1127,7 +1092,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; prid_t prid; struct xfs_dquot *udqp = NULL; @@ -1164,8 +1128,6 @@ xfs_create( tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not @@ -1183,10 +1145,9 @@ xfs_create( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -1217,7 +1178,7 @@ xfs_create( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } /* @@ -1235,7 +1196,7 @@ xfs_create( resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != -ENOSPC); - goto out_trans_abort; + goto out_trans_cancel; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -1269,7 +1230,7 @@ xfs_create( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1282,10 +1243,8 @@ xfs_create( out_bmap_cancel: xfs_bmap_cancel(&free_list); - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1317,7 +1276,6 @@ xfs_create_tmpfile( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1350,10 +1308,8 @@ xfs_create_tmpfile( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1365,7 +1321,7 @@ xfs_create_tmpfile( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } if (mp->m_flags & XFS_MOUNT_WSYNC) @@ -1381,9 +1337,9 @@ xfs_create_tmpfile( ip->i_d.di_nlink--; error = xfs_iunlink(tp, ip); if (error) - goto out_trans_abort; + goto out_trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1394,10 +1350,8 @@ xfs_create_tmpfile( *ipp = ip; return 0; - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1427,7 +1381,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; int resblks; @@ -1447,17 +1400,14 @@ xfs_link( goto std_return; tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); if (error == -ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto error_return; - } xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1486,19 +1436,19 @@ xfs_link( if (sip->i_d.di_nlink == 0) { error = xfs_iunlink_remove(tp, sip); if (error) - goto abort_return; + goto error_return; } error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); if (error) - goto abort_return; + goto error_return; xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); if (error) - goto abort_return; + goto error_return; /* * If this is a synchronous mount, make sure that the @@ -1512,15 +1462,13 @@ xfs_link( error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); - goto abort_return; + goto error_return; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; error_return: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -1555,7 +1503,6 @@ xfs_itruncate_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - struct xfs_trans *ntp; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; @@ -1613,29 +1560,7 @@ xfs_itruncate_extents( if (error) goto out_bmap_cancel; - if (committed) { - /* - * Mark the inode dirty so it will be logged and - * moved forward in the log as part of every commit. - */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - - ntp = xfs_trans_dup(tp); - error = xfs_trans_commit(tp, 0); - tp = ntp; - - xfs_trans_ijoin(tp, ip, 0); - - if (error) - goto out; - - /* - * Transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_roll(&tp, ip); if (error) goto out; } @@ -1756,7 +1681,7 @@ xfs_inactive_truncate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1777,7 +1702,7 @@ xfs_inactive_truncate( ASSERT(ip->i_d.di_nextents == 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error_unlock; @@ -1785,7 +1710,7 @@ xfs_inactive_truncate( return 0; error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1835,7 +1760,7 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(tp); return error; } @@ -1855,7 +1780,7 @@ xfs_inactive_ifree( __func__, error); xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1874,7 +1799,7 @@ xfs_inactive_ifree( if (error) xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); @@ -2235,28 +2160,42 @@ xfs_iunlink_remove( */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, - xfs_ino_t inum) + xfs_inode_t *free_ip, + xfs_trans_t *tp, + struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; int inodes_per_cluster; int nbufs; int i, j; + int ioffset; xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; struct xfs_perag *pag; + xfs_ino_t inum; + inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); blks_per_cluster = xfs_icluster_size_fsb(mp); inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; nbufs = mp->m_ialloc_blks / blks_per_cluster; for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + /* + * The allocation bitmap tells us which inodes of the chunk were + * physically allocated. Skip the cluster if an inode falls into + * a sparse region. + */ + ioffset = inum - xic->first_ino; + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { + ASSERT(do_mod(ioffset, inodes_per_cluster) == 0); + continue; + } + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2414,8 +2353,7 @@ xfs_ifree( xfs_bmap_free_t *flist) { int error; - int delete; - xfs_ino_t first_ino; + struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2431,7 +2369,7 @@ xfs_ifree( if (error) return error; - error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + error = xfs_difree(tp, ip->i_ino, flist, &xic); if (error) return error; @@ -2448,8 +2386,8 @@ xfs_ifree( ip->i_d.di_gen++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (delete) - error = xfs_ifree_cluster(ip, tp, first_ino); + if (xic.deleted) + error = xfs_ifree_cluster(ip, tp, &xic); return error; } @@ -2536,7 +2474,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; uint resblks; @@ -2557,7 +2494,6 @@ xfs_remove( tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); else tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * We try to get the real space reservation first, @@ -2576,7 +2512,6 @@ xfs_remove( } if (error) { ASSERT(error != -ENOSPC); - cancel_flags = 0; goto out_trans_cancel; } @@ -2588,7 +2523,6 @@ xfs_remove( /* * If we're removing a directory perform some additional validation. */ - cancel_flags |= XFS_TRANS_ABORT; if (is_dir) { ASSERT(ip->i_d.di_nlink >= 2); if (ip->i_d.di_nlink != 2) { @@ -2644,7 +2578,7 @@ xfs_remove( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto std_return; @@ -2656,7 +2590,7 @@ xfs_remove( out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -2730,11 +2664,11 @@ xfs_finish_rename( error = xfs_bmap_finish(&tp, free_list, &committed); if (error) { xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); } /* @@ -2855,7 +2789,7 @@ xfs_cross_rename( out_trans_abort: xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -2915,7 +2849,6 @@ xfs_rename( int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - int cancel_flags = 0; int spaceres; int error; @@ -2951,7 +2884,6 @@ xfs_rename( } if (error) goto out_trans_cancel; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes @@ -3022,10 +2954,8 @@ xfs_rename( error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, &free_list, spaceres); - if (error == -ENOSPC) - goto out_bmap_cancel; if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3033,7 +2963,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } else { /* target_ip != NULL */ /* @@ -3065,7 +2995,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3076,7 +3006,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; if (src_is_directory) { /* @@ -3084,7 +3014,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } /* target_ip != NULL */ @@ -3101,7 +3031,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3127,7 +3057,7 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3142,7 +3072,7 @@ xfs_rename( error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto out_trans_abort; + goto out_bmap_cancel; /* * For whiteouts, we need to bump the link count on the whiteout inode. @@ -3156,10 +3086,10 @@ xfs_rename( ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); error = xfs_bumplink(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; error = xfs_iunlink_remove(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); /* @@ -3180,12 +3110,10 @@ xfs_rename( IRELE(wip); return error; -out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); if (wip) IRELE(wip); return error; @@ -3464,7 +3392,7 @@ xfs_iflush_int( ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 87f67c6b654c..ea7d85af5310 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -336,7 +336,7 @@ xfs_set_dmattrs( tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -346,7 +346,7 @@ xfs_set_dmattrs( ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; } @@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans( return tp; out_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return ERR_PTR(error); } @@ -1253,7 +1253,7 @@ xfs_ioctl_setattr( else ip->i_d.di_extsize = 0; - code = xfs_trans_commit(tp, 0); + code = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. @@ -1265,7 +1265,7 @@ xfs_ioctl_setattr( return code; error_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); @@ -1338,11 +1338,11 @@ xfs_ioc_setxflags( error = xfs_ioctl_setattr_xflags(tp, ip, &fa); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_write; } - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_write: mnt_drop_write_file(filp); return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 38e633bad8c2..1f86033171c8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -183,7 +183,7 @@ xfs_iomap_write_direct( * Check for running out of space, note: need lock to return */ if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -213,7 +213,7 @@ xfs_iomap_write_direct( error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_unlock; @@ -236,7 +236,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } @@ -690,7 +690,7 @@ xfs_iomap_write_allocate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, nres, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -760,7 +760,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; @@ -791,7 +791,7 @@ xfs_iomap_write_allocate( trans_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -853,7 +853,7 @@ xfs_iomap_write_unwritten( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -890,7 +890,7 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -914,7 +914,7 @@ xfs_iomap_write_unwritten( error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 7f51f39f8acc..766b23f86ce9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -699,7 +699,7 @@ xfs_setattr_nonsize( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -730,7 +730,7 @@ xfs_setattr_nonsize( return 0; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); @@ -752,7 +752,6 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; - uint commit_flags = 0; bool did_zeroing = false; trace_xfs_setattr(ip); @@ -848,7 +847,11 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); + else + error = block_truncate_page(inode->i_mapping, newsize, + xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); @@ -858,7 +861,6 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -898,7 +900,7 @@ xfs_setattr_size( if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) - goto out_trans_abort; + goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data @@ -925,16 +927,14 @@ xfs_setattr_size( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; -out_trans_abort: - commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, commit_flags); + xfs_trans_cancel(tp); goto out_unlock; } @@ -981,7 +981,7 @@ xfs_vn_update_time( tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1003,7 +1003,7 @@ xfs_vn_update_time( } xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -1188,22 +1188,22 @@ xfs_diflags_to_iflags( struct inode *inode, struct xfs_inode *ip) { - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + uint16_t flags = ip->i_d.di_flags; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | + S_NOATIME | S_DAX); + + if (flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + if (flags & XFS_DIFLAG_APPEND) inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + if (flags & XFS_DIFLAG_SYNC) inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; + /* XXX: Also needs an on-disk per inode flag! */ + if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + inode->i_flags |= S_DAX; } /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 80429891dc9b..f41b0c3fddab 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk( } irec->ir_free |= xfs_inobt_maskn(0, idx); - *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + *icount = irec->ir_count - irec->ir_freecount; } return 0; @@ -415,6 +415,8 @@ xfs_bulkstat( goto del_cursor; if (icount) { irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; @@ -447,13 +449,15 @@ xfs_bulkstat( * If this chunk has any allocated inodes, save it. * Also start read-ahead now for this chunk. */ - if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + if (r.ir_freecount < r.ir_count) { xfs_bulkstat_ichunk_ra(mp, agno, &r); irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + icount += r.ir_count - r.ir_freecount; } error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { @@ -599,8 +603,7 @@ xfs_inumbers( agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, r.ir_startino); - buffer[bufidx].xi_alloccount = - XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount; buffer[bufidx].xi_allocmask = ~r.ir_free; if (++bufidx == bcount) { long written; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7c7842c85a08..85f883dd6207 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,26 +32,12 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef __uint32_t inst_t; /* an instruction */ - typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ typedef __s64 xfs_daddr_t; /* <disk address> type */ -typedef char * xfs_caddr_t; /* <core address> type */ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; -/* __psint_t is the same size as a pointer */ -#if (BITS_PER_LONG == 32) -typedef __int32_t __psint_t; -typedef __uint32_t __psunsigned_t; -#elif (BITS_PER_LONG == 64) -typedef __int64_t __psint_t; -typedef __uint64_t __psunsigned_t; -#else -#error BITS_PER_LONG must be 32 or 64 -#endif - #include "xfs_types.h" #include "kmem.h" diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bcc7cfabb787..08d4fe46f0fa 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -109,7 +109,7 @@ xlog_ungrant_log_space( STATIC void xlog_verify_dest_ptr( struct xlog *log, - char *ptr); + void *ptr); STATIC void xlog_verify_grant_tail( struct xlog *log); @@ -513,7 +513,7 @@ xfs_log_done( struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags) + bool regrant) { struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; @@ -526,14 +526,11 @@ xfs_log_done( (((ticket->t_flags & XLOG_TIC_INITED) == 0) && (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; - if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { - flags |= XFS_LOG_REL_PERM_RESERV; - } + regrant = false; } - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || - (flags & XFS_LOG_REL_PERM_RESERV)) { + if (!regrant) { trace_xfs_log_done_nonperm(log, ticket); /* @@ -541,7 +538,6 @@ xfs_log_done( * request has been made to release a permanent reservation. */ xlog_ungrant_log_space(log, ticket); - xfs_log_ticket_put(ticket); } else { trace_xfs_log_done_perm(log, ticket); @@ -553,6 +549,7 @@ xfs_log_done( ticket->t_flags |= XLOG_TIC_INITED; } + xfs_log_ticket_put(ticket); return lsn; } @@ -1447,7 +1444,7 @@ xlog_alloc_log( iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; #ifdef DEBUG - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + log->l_iclog_bak[i] = &iclog->ic_header; #endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); @@ -1602,7 +1599,7 @@ xlog_pack_data( int i, j, k; int size = iclog->ic_offset + roundoff; __be32 cycle_lsn; - xfs_caddr_t dp; + char *dp; cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); @@ -3664,7 +3661,7 @@ xlog_ticket_alloc( void xlog_verify_dest_ptr( struct xlog *log, - char *ptr) + void *ptr) { int i; int good_ptr = 0; @@ -3767,9 +3764,8 @@ xlog_verify_iclog( xlog_op_header_t *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; - xfs_caddr_t ptr; - xfs_caddr_t base_ptr; - __psint_t field_offset; + void *base_ptr, *ptr, *p; + ptrdiff_t field_offset; __uint8_t clientid; int len, i, j, k, op_len; int idx; @@ -3788,9 +3784,9 @@ xlog_verify_iclog( if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - ptr = (xfs_caddr_t) &iclog->ic_header; - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; - ptr += BBSIZE) { + base_ptr = ptr = &iclog->ic_header; + p = &iclog->ic_header; + for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", __func__); @@ -3798,20 +3794,19 @@ xlog_verify_iclog( /* check fields */ len = be32_to_cpu(iclog->ic_header.h_num_logops); - ptr = iclog->ic_datap; - base_ptr = ptr; - ophead = (xlog_op_header_t *)ptr; + base_ptr = ptr = iclog->ic_datap; + ophead = ptr; xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = (xlog_op_header_t *)ptr; + ophead = ptr; /* clientid is only 1 byte */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + p = &ophead->oh_clientid; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3829,13 +3824,13 @@ xlog_verify_iclog( (unsigned long)field_offset); /* check length */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + p = &ophead->oh_len; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((__psint_t)&ophead->oh_len - - (__psint_t)iclog->ic_datap); + idx = BTOBBT((uintptr_t)&ophead->oh_len - + (uintptr_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e0deb95abd..fa27aaec72cb 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LSN_CMP(x,y) _lsn_cmp(x,y) /* - * Macros, structures, prototypes for interface to the log manager. - */ - -/* - * Flags to xfs_log_done() - */ -#define XFS_LOG_REL_PERM_RESERV 0x1 - -/* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk @@ -138,7 +129,7 @@ struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags); + bool regrant); int _xfs_log_force(struct xfs_mount *mp, uint flags, int *log_forced); @@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, int flags); + xfs_lsn_t *commit_lsn, bool regrant); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 45cc0ce18adf..abc2ccbff739 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -624,7 +624,7 @@ restart: spin_unlock(&cil->xc_push_lock); /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); if (commit_lsn == -1) goto out_abort; @@ -773,14 +773,10 @@ xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, - int flags) + bool regrant) { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; - int log_flags = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; /* lock out background commit */ down_read(&cil->xc_ctx_lock); @@ -795,7 +791,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = tp->t_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_log_done(mp, tp->t_ticket, NULL, regrant); xfs_trans_unreserve_and_mod_sb(tp); /* @@ -809,7 +805,7 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + xfs_trans_free_items(tp, tp->t_commit_lsn, false); xlog_cil_push_background(log); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index db7cbdeb2b42..1c87c8abfbed 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -409,7 +409,7 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG - char *l_iclog_bak[XLOG_MAX_ICLOGS]; + void *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4f5784f85a5b..01dd228ca05e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_put_bp( * Return the address of the start of the given block number's data * in a log buffer. The buffer covers a log sector-aligned region. */ -STATIC xfs_caddr_t +STATIC char * xlog_align( struct xlog *log, xfs_daddr_t blk_no, @@ -203,7 +203,7 @@ xlog_bread( xfs_daddr_t blk_no, int nbblks, struct xfs_buf *bp, - xfs_caddr_t *offset) + char **offset) { int error; @@ -225,9 +225,9 @@ xlog_bread_offset( xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ struct xfs_buf *bp, - xfs_caddr_t offset) + char *offset) { - xfs_caddr_t orig_offset = bp->b_addr; + char *orig_offset = bp->b_addr; int orig_len = BBTOB(bp->b_length); int error, error2; @@ -396,7 +396,7 @@ xlog_find_cycle_start( xfs_daddr_t *last_blk, uint cycle) { - xfs_caddr_t offset; + char *offset; xfs_daddr_t mid_blk; xfs_daddr_t end_blk; uint mid_cycle; @@ -443,7 +443,7 @@ xlog_find_verify_cycle( uint cycle; xfs_buf_t *bp; xfs_daddr_t bufblks; - xfs_caddr_t buf = NULL; + char *buf = NULL; int error = 0; /* @@ -509,7 +509,7 @@ xlog_find_verify_log_record( { xfs_daddr_t i; xfs_buf_t *bp; - xfs_caddr_t offset = NULL; + char *offset = NULL; xlog_rec_header_t *head = NULL; int error = 0; int smallmem = 0; @@ -616,7 +616,7 @@ xlog_find_head( xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; int num_scan_bblks; uint first_half_cycle, last_half_cycle; @@ -891,7 +891,7 @@ xlog_find_tail( { xlog_rec_header_t *rhead; xlog_op_header_t *op_head; - xfs_caddr_t offset = NULL; + char *offset = NULL; xfs_buf_t *bp; int error, i, found; xfs_daddr_t umount_data_blk; @@ -1099,7 +1099,7 @@ xlog_find_zeroed( xfs_daddr_t *blk_no) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; uint first_cycle, last_cycle; xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; @@ -1199,7 +1199,7 @@ bp_err: STATIC void xlog_add_record( struct xlog *log, - xfs_caddr_t buf, + char *buf, int cycle, int block, int tail_cycle, @@ -1227,7 +1227,7 @@ xlog_write_log_records( int tail_cycle, int tail_block) { - xfs_caddr_t offset; + char *offset; xfs_buf_t *bp; int balign, ealign; int sectbb = log->l_sectBBsize; @@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer( return -EFSCORRUPTED; } - buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, - next_unlinked_offset); + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; /* @@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer( * have to leave the inode in a consistent state for whoever * reads it next.... */ - xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_dinode_calc_crc(mp, xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); } @@ -2503,8 +2502,8 @@ xlog_recover_inode_pass2( xfs_buf_t *bp; xfs_dinode_t *dip; int len; - xfs_caddr_t src; - xfs_caddr_t dest; + char *src; + char *dest; int error; int attr_index; uint fields; @@ -2546,7 +2545,7 @@ xlog_recover_inode_pass2( goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks @@ -2885,7 +2884,7 @@ xlog_recover_dquot_pass2( return error; ASSERT(bp); - ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); /* * If the dquot has an LSN in it, recover the dquot only if it's less @@ -3068,12 +3067,22 @@ xlog_recover_do_icreate_pass2( return -EINVAL; } - /* existing allocation is fixed value */ - ASSERT(count == mp->m_ialloc_inos); - ASSERT(length == mp->m_ialloc_blks); - if (count != mp->m_ialloc_inos || - length != mp->m_ialloc_blks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + /* + * The inode chunk is either full or sparse and we only support + * m_ialloc_min_blks sized sparse allocations at this time. + */ + if (length != mp->m_ialloc_blks && + length != mp->m_ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); + return -EINVAL; + } + + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); return -EINVAL; } @@ -3091,8 +3100,8 @@ xlog_recover_do_icreate_pass2( XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) return 0; - xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); + xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); return 0; } @@ -3364,17 +3373,17 @@ STATIC int xlog_recover_add_to_cont_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xlog_recover_item_t *item; - xfs_caddr_t ptr, old_ptr; + char *ptr, *old_ptr; int old_len; if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); - ptr = (xfs_caddr_t) &trans->r_theader + + ptr = (char *)&trans->r_theader + sizeof(xfs_trans_header_t) - len; memcpy(ptr, dp, len); return 0; @@ -3410,12 +3419,12 @@ STATIC int xlog_recover_add_to_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xfs_inode_log_format_t *in_f; /* any will do */ xlog_recover_item_t *item; - xfs_caddr_t ptr; + char *ptr; if (!len) return 0; @@ -3504,7 +3513,7 @@ STATIC int xlog_recovery_process_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, unsigned int len, unsigned int flags, int pass) @@ -3611,8 +3620,8 @@ xlog_recover_process_ophdr( struct hlist_head rhash[], struct xlog_rec_header *rhead, struct xlog_op_header *ohead, - xfs_caddr_t dp, - xfs_caddr_t end, + char *dp, + char *end, int pass) { struct xlog_recover *trans; @@ -3661,11 +3670,11 @@ xlog_recover_process_data( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, int pass) { struct xlog_op_header *ohead; - xfs_caddr_t end; + char *end; int num_logops; int error; @@ -3751,11 +3760,11 @@ xlog_recover_process_efi( } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; abort_error: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -3857,13 +3866,13 @@ xlog_recover_clear_agi_bucket( xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out_error; return; out_abort: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); return; @@ -4010,7 +4019,7 @@ xlog_recover_process_iunlinks( STATIC int xlog_unpack_data_crc( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { __le32 crc; @@ -4040,7 +4049,7 @@ xlog_unpack_data_crc( STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { int i, j, k; @@ -4122,7 +4131,7 @@ xlog_do_recovery_pass( { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t offset; + char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f23fbdfb365..461e791efad7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -725,6 +725,22 @@ xfs_mountfs( } /* + * If enabled, sparse inode chunk alignment is expected to match the + * cluster size. Full inode chunk alignment must match the chunk size, + * but that is checked on sb read verification... + */ + if (xfs_sb_version_hassparseinodes(&mp->m_sb) && + mp->m_sb.sb_spino_align != + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) { + xfs_warn(mp, + "Sparse inode block alignment (%u) must match cluster size (%llu).", + mp->m_sb.sb_spino_align, + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)); + error = -EINVAL; + goto out_remove_uuid; + } + + /* * Set inode alignment fields */ xfs_set_inoalignment(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c995a2ccb6f..7999e91cd49a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -101,6 +101,8 @@ typedef struct xfs_mount { __uint64_t m_flags; /* global mount flags */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ + int m_ialloc_min_blks;/* min blocks in sparse inode + * allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ @@ -179,6 +181,8 @@ typedef struct xfs_mount { allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ +#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ + /* * Default minimum read and write sizes. diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 981a657eca39..ab4a6066f7ca 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -306,7 +306,7 @@ xfs_fs_commit_blocks( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_iolock; } @@ -321,7 +321,7 @@ xfs_fs_commit_blocks( } xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5538468c7f63..eac9549efd52 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -756,7 +756,7 @@ xfs_qm_qino_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, XFS_QM_QINOCREATE_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -764,8 +764,7 @@ xfs_qm_qino_alloc( error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } } @@ -796,7 +795,7 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9a25c9275fb3..3640c6e896af 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile( tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile( error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } ASSERT(ip->i_d.di_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); @@ -437,7 +436,7 @@ xfs_qm_scall_setqlim( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_rele; } @@ -548,7 +547,7 @@ xfs_qm_scall_setqlim( dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_rele: xfs_qm_dqrele(dqp); @@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - return error; + return xfs_trans_commit(tp); } @@ -605,7 +603,7 @@ xfs_qm_log_quotaoff( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } @@ -624,7 +622,7 @@ xfs_qm_log_quotaoff( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5376dd406ba2..ce6506adab7b 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -55,7 +55,6 @@ struct xfs_trans; typedef struct xfs_dqtrx { struct xfs_dquot *qt_dquot; /* the dquot this refers to */ ulong qt_blk_res; /* blks reserved on a dquot */ - ulong qt_blk_res_used; /* blks used from the reservation */ ulong qt_ino_res; /* inode reserved on a dquot */ ulong qt_ino_res_used; /* inodes used from the reservation */ long qt_bcount_delta; /* dquot blk count changes */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f2079b6911cc..f4e8c06eee26 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,7 +780,6 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - int cancelflags = 0; xfs_trans_t *tp; tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); @@ -792,7 +791,6 @@ xfs_growfs_rt_alloc( resblks, 0); if (error) goto error_cancel; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* * Lock the inode. */ @@ -804,7 +802,6 @@ xfs_growfs_rt_alloc( * Allocate blocks to the bitmap file. */ nmap = 1; - cancelflags |= XFS_TRANS_ABORT; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, &firstblock, resblks, &map, &nmap, &flist); @@ -818,14 +815,13 @@ xfs_growfs_rt_alloc( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto error_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. */ - cancelflags = 0; for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { @@ -851,7 +847,7 @@ xfs_growfs_rt_alloc( if (bp == NULL) { error = -EIO; error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); goto error; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); @@ -859,7 +855,7 @@ error_cancel: /* * Commit the transaction. */ - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto error; } @@ -973,7 +969,6 @@ xfs_growfs_rt( bmbno < nrbmblocks; bmbno++) { xfs_trans_t *tp; - int cancelflags = 0; *nmp = *mp; nsbp = &nmp->m_sb; @@ -1015,7 +1010,6 @@ xfs_growfs_rt( mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - cancelflags |= XFS_TRANS_ABORT; /* * Get the summary inode into the transaction. */ @@ -1062,7 +1056,7 @@ xfs_growfs_rt( nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); if (error) { error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); break; } /* @@ -1076,7 +1070,7 @@ error_cancel: mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) break; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 858e1e62bbaa..1fb16562c159 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ + /* * Table driven mount option parser. * @@ -363,6 +365,10 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; +#ifdef CONFIG_FS_DAX + } else if (!strcmp(this_char, MNTOPT_DAX)) { + mp->m_flags |= XFS_MOUNT_DAX; +#endif } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -452,8 +458,8 @@ done: } struct proc_xfs_info { - int flag; - char *str; + uint64_t flag; + char *str; }; STATIC int @@ -474,6 +480,7 @@ xfs_showargs( { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { XFS_MOUNT_DAX, "," MNTOPT_DAX }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -1507,6 +1514,20 @@ xfs_fs_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= MS_I_VERSION; + if (mp->m_flags & XFS_MOUNT_DAX) { + xfs_warn(mp, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + if (sb->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "Filesystem block size invalid for DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + xfs_alert(mp, + "Block device does not support DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } + } + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3df411eadb86..4be27b0210af 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -104,7 +104,7 @@ xfs_readlink_bmap( cur_chunk += sizeof(struct xfs_dsymlink_hdr); } - memcpy(link + offset, bp->b_addr, byte_cnt); + memcpy(link + offset, cur_chunk, byte_cnt); pathlen -= byte_cnt; offset += byte_cnt; @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; @@ -224,7 +223,6 @@ xfs_symlink( return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -239,10 +237,8 @@ xfs_symlink( resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -394,7 +390,7 @@ xfs_symlink( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -407,9 +403,8 @@ xfs_symlink( out_bmap_cancel: xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt( tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt( /* * Commit the transaction containing extent freeing and EFDs. */ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error_unlock; @@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt( error_bmap_cancel: xfs_bmap_cancel(&free_list); error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 615781bf4ee5..8d916d33d93d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) +TRACE_EVENT(xfs_irec_merge_pre, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), + TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + __field(xfs_agino_t, nagino) + __field(uint16_t, nholemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + __entry->nagino = nagino; + __entry->nholemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->agino, __entry->holemask, __entry->nagino, + __entry->nholemask) +) + +TRACE_EVENT(xfs_irec_merge_post, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask), + TP_ARGS(mp, agno, agino, holemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->agno, __entry->agino, + __entry->holemask) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 220ef2c906b2..0582a27107d4 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -113,7 +113,7 @@ xfs_trans_free( * blocks. Locks and log items, however, are no inherited. They must * be added to the new transaction explicitly. */ -xfs_trans_t * +STATIC xfs_trans_t * xfs_trans_dup( xfs_trans_t *tp) { @@ -251,14 +251,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - int log_flags; - - if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -744,7 +737,7 @@ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags) + bool abort) { struct xfs_log_item_desc *lidp, *next; @@ -755,7 +748,7 @@ xfs_trans_free_items( if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); - if (flags & XFS_TRANS_ABORT) + if (abort) lip->li_flags |= XFS_LI_ABORTED; lip->li_ops->iop_unlock(lip); @@ -892,27 +885,17 @@ xfs_trans_committed_bulk( * have already been unlocked as if the commit had succeeded. * Do not reference the transaction structure after this call. */ -int -xfs_trans_commit( +static int +__xfs_trans_commit( struct xfs_trans *tp, - uint flags) + bool regrant) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; int error = 0; - int log_flags = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. - */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } - - /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the * transaction and free the transaction structure. @@ -936,7 +919,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xfs_log_commit_cil(mp, tp, &commit_lsn, flags); + xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -964,18 +947,25 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); return error; } +int +xfs_trans_commit( + struct xfs_trans *tp) +{ + return __xfs_trans_commit(tp, false); +} + /* * Unlock all of the transaction's items and free the transaction. * The transaction must not have modified any of its items, because @@ -986,29 +976,22 @@ out_unreserve: */ void xfs_trans_cancel( - xfs_trans_t *tp, - int flags) + struct xfs_trans *tp) { - int log_flags; - xfs_mount_t *mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); /* - * See if the caller is being too lazy to figure out if - * the transaction really needs an abort. - */ - if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) - flags &= ~XFS_TRANS_ABORT; - /* * See if the caller is relying on us to shut down the * filesystem. This happens in paths where we detect * corruption and decide to give up. */ - if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + if (dirty && !XFS_FORCED_SHUTDOWN(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item_desc *lidp; list_for_each_entry(lidp, &tp->t_items, lid_trans) @@ -1018,27 +1001,20 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - } + if (tp->t_ticket) + xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); } /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when - * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * committed with xfs_trans_commit(), but we still want as soon * as possible to let chunks of it go to the log. So we commit the * chunk we've been working on and get a new transaction to continue. */ @@ -1055,7 +1031,8 @@ xfs_trans_roll( * Ensure that the inode is always logged. */ trans = *tpp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + if (dp) + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); /* * Copy the critical parameters from one trans to the next. @@ -1071,20 +1048,13 @@ xfs_trans_roll( * is in progress. The caller takes the responsibility to cancel * the duplicate transaction that gets returned. */ - error = xfs_trans_commit(trans, 0); + error = __xfs_trans_commit(trans, true); if (error) return error; trans = *tpp; /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(trans->t_ticket); - - - /* * Reserve space in the log for th next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log @@ -1100,6 +1070,7 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp, 0); + if (dp) + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b5bc1ab3c4da..3b21b4e5e467 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -133,8 +133,6 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces that are * actually macros. */ -#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) -#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) @@ -153,7 +151,6 @@ typedef struct xfs_trans { */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -228,9 +225,9 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); -void xfs_trans_cancel(xfs_trans_t *, int); +void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 573aefb5a573..1098cf490189 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next( { struct xfs_log_item *lip = cur->item; - if ((__psint_t)lip & 1) + if ((uintptr_t)lip & 1) lip = xfs_ail_min(ailp); if (lip) cur->item = xfs_ail_next(ailp, lip); @@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear( list_for_each_entry(cur, &ailp->xa_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) - ((__psint_t)cur->item | 1); + ((uintptr_t)cur->item | 1); } } @@ -287,7 +287,7 @@ xfs_ail_splice( * find the place in the AIL where the items belong. */ lip = cur ? cur->item : NULL; - if (!lip || (__psint_t) lip & 1) + if (!lip || (uintptr_t)lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 76a16df55ef7..ce78534a047e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo( xfs_trans_t *ntp) { xfs_dqtrx_t *oq, *nq; - int i,j; + int i, j; xfs_dqtrx_t *oqa, *nqa; + ulong blk_res_used; if (!otp->t_dqinfo) return; @@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo( * Because the quota blk reservation is carried forward, * it is also necessary to carry forward the DQ_DIRTY flag. */ - if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + if (otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { oqa = otp->t_dqinfo->dqs[j]; nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + blk_res_used = 0; + if (oqa[i].qt_dquot == NULL) break; oq = &oqa[i]; nq = &nqa[i]; + if (oq->qt_blk_res && oq->qt_bcount_delta > 0) + blk_res_used = oq->qt_bcount_delta; + nq->qt_dquot = oq->qt_dquot; nq->qt_bcount_delta = nq->qt_icount_delta = 0; nq->qt_rtbcount_delta = 0; @@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo( /* * Transfer whatever is left of the reservations. */ - nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; - oq->qt_blk_res = oq->qt_blk_res_used; + nq->qt_blk_res = oq->qt_blk_res - blk_res_used; + oq->qt_blk_res = blk_res_used; nq->qt_rtblk_res = oq->qt_rtblk_res - oq->qt_rtblk_res_used; @@ -239,10 +245,6 @@ xfs_trans_mod_dquot( * disk blocks used. */ case XFS_TRANS_DQ_BCOUNT: - if (qtrx->qt_blk_res && delta > 0) { - qtrx->qt_blk_res_used += (ulong)delta; - ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); - } qtrx->qt_bcount_delta += delta; break; @@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas( * reservation that a transaction structure knows of. */ if (qtrx->qt_blk_res != 0) { - if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { - if (qtrx->qt_blk_res > - qtrx->qt_blk_res_used) + ulong blk_res_used = 0; + + if (qtrx->qt_bcount_delta > 0) + blk_res_used = qtrx->qt_bcount_delta; + + if (qtrx->qt_blk_res != blk_res_used) { + if (qtrx->qt_blk_res > blk_res_used) dqp->q_res_bcount -= (xfs_qcnt_t) (qtrx->qt_blk_res - - qtrx->qt_blk_res_used); + blk_res_used); else dqp->q_res_bcount -= (xfs_qcnt_t) - (qtrx->qt_blk_res_used - + (blk_res_used - qtrx->qt_blk_res); } } else { diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd1281862ad7..1b736294558a 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags); + bool abort); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, |