diff options
Diffstat (limited to 'fs')
210 files changed, 5415 insertions, 3757 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index eb3589edf485..0576eaeb60b9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, } static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; - struct inode *inode = d_inode(dentry); v9ses = v9fs_dentry2v9ses(dentry); /* diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 18c62bae9591..a6bd349bab23 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { const char *full_name = xattr_full_name(handler, name); diff --git a/fs/Kconfig b/fs/Kconfig index 6725f59c18e6..b8fcb416be72 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -52,6 +52,7 @@ config FS_DAX_PMD depends on FS_DAX depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE + depends on BROKEN endif # BLOCK diff --git a/fs/affs/super.c b/fs/affs/super.c index 2a6713b6b9f4..d6384863192c 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -528,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) char *prefix = NULL; new_opts = kstrdup(data, GFP_KERNEL); - if (!new_opts) + if (data && !new_opts) return -ENOMEM; pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); @@ -546,7 +546,8 @@ affs_remount(struct super_block *sb, int *flags, char *data) } flush_delayed_work(&sbi->sb_work); - replace_mount_options(sb, new_opts); + if (new_opts) + replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; sbi->s_mode = mode; diff --git a/fs/afs/write.c b/fs/afs/write.c index 65de439bdc4f..14d506efd1aa 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) return 0; result = generic_file_write_iter(iocb, from); - if (IS_ERR_VALUE(result)) { - _leave(" = %zd", result); - return result; - } _leave(" = %zd", result); return result; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 72e35b721608..3ba385eaa26e 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -100,8 +100,8 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) return -EIO; } -static int bad_inode_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { return -EIO; } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 2fab9f130e51..ae1b5404fced 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -127,12 +127,8 @@ static int set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); - if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; - } + if (end > start) + return vm_brk(start, end - start); return 0; } @@ -275,7 +271,7 @@ static int load_aout_binary(struct linux_binprm * bprm) map_size = ex.a_text+ex.a_data; #endif error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) + if (error) return error; error = read_code(bprm->file, text_addr, pos, @@ -298,7 +294,7 @@ static int load_aout_binary(struct linux_binprm * bprm) if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (IS_ERR_VALUE(error)) + if (error) return error; read_code(bprm->file, N_TXTADDR(ex), fd_offset, @@ -382,7 +378,7 @@ static int load_aout_library(struct file *file) file); } retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (IS_ERR_VALUE(retval)) + if (retval) goto out; read_code(file, start_addr, N_TXTOFF(ex), @@ -402,9 +398,8 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - error = vm_brk(start_addr + len, bss - len); - retval = error; - if (error != start_addr + len) + retval = vm_brk(start_addr + len, bss - len); + if (retval) goto out; } retval = 0; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 938fc4ede764..a7a28110dc80 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end) start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; + int error = vm_brk(start, end - start); + if (error) + return error; } current->mm->start_brk = current->mm->brk = end; return 0; @@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); - if (BAD_ADDR(error)) + if (error) goto out; } @@ -1178,7 +1177,7 @@ static int load_elf_library(struct file *file) bss = eppnt->p_memsz + eppnt->p_vaddr; if (bss > len) { error = vm_brk(len, bss - len); - if (BAD_ADDR(error)) + if (error) goto out_free_ph; } error = 0; @@ -2276,7 +2275,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) + if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 71ade0e556b7..203589311bf8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1787,7 +1787,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) + if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; if (!elf_fdpic_dump_segments(cprm)) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index f723cd3a455c..caf9e39bb82b 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -337,7 +337,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) "(%d != %d)", (unsigned) r, curid, id); goto failed; } else if ( ! p->lib_list[id].loaded && - IS_ERR_VALUE(load_flat_shared_library(id, p))) { + load_flat_shared_library(id, p) < 0) { printk("BINFMT_FLAT: failed to load library %d", id); goto failed; } @@ -837,7 +837,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) res = prepare_binprm(&bprm); - if (!IS_ERR_VALUE(res)) + if (!res) res = load_flat_file(&bprm, libs, id, NULL); abort_creds(bprm.cred); @@ -883,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm) stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); - if (IS_ERR_VALUE(res)) + if (res < 0) return res; /* Update data segment pointers for all libraries */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 1089dbf25925..71ccab1d22c6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); +void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; @@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); if (!avail) return -ERANGE; if (avail > 0 && avail & ~PAGE_MASK) @@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) } EXPORT_SYMBOL_GPL(bdev_direct_access); +/** + * bdev_dax_supported() - Check if the device supports dax for filesystem + * @sb: The superblock of the device + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: negative errno if unsupported, 0 if supported. + */ +int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + struct blk_dax_ctl dax = { + .sector = 0, + .size = PAGE_SIZE, + }; + int err; + + if (blocksize != PAGE_SIZE) { + vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); + return -EINVAL; + } + + err = bdev_direct_access(sb->s_bdev, &dax); + if (err < 0) { + switch (err) { + case -EOPNOTSUPP: + vfs_msg(sb, KERN_ERR, + "error: device does not support dax"); + break; + case -EINVAL: + vfs_msg(sb, KERN_ERR, + "error: unaligned partition for dax"); + break; + default: + vfs_msg(sb, KERN_ERR, + "error: dax access failed (%d)", err); + } + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bdev_dax_supported); + +/** + * bdev_dax_capable() - Return if the raw device is capable for dax + * @bdev: The device for raw block device access + */ +bool bdev_dax_capable(struct block_device *bdev) +{ + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + }; + + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + + dax.sector = 0; + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + return true; +} + /* * pseudo-fs */ @@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static bool blkdev_dax_capable(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - - if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX)) - return false; - - /* - * If the partition is not aligned on a page boundary, we can't - * do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) - || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) - return false; - - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - - return true; -} - static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* @@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } @@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index d3090187fd76..8bb3509099e8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1939,7 +1939,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, * from ipath->fspath->val[i]. * when it returns, there are ipath->fspath->elem_cnt number of paths available * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the - * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise, * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would * have been needed to return all paths. */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1da5753d886d..4919aedb5fc1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -313,7 +313,7 @@ struct btrfs_dio_private { struct bio *dio_bio; /* - * The original bio may be splited to several sub-bios, this is + * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 516e19d1d202..b677a6ea6001 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1939,7 +1939,7 @@ again: /* * Clear all references of this block. Do not free * the block itself even if is not referenced anymore - * because it still carries valueable information + * because it still carries valuable information * like whether it was ever written and IO completed. */ list_for_each_entry_safe(l, tmp, &block->ref_to_list, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index decd0a3f5d61..46025688f1d0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -156,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) /* * RCU really hurts here, we could free up the root node because - * it was cow'ed but we may not get the new root node yet so do + * it was COWed but we may not get the new root node yet so do * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ @@ -955,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf) { /* - * Tree blocks not in refernece counted trees and tree roots + * Tree blocks not in reference counted trees and tree roots * are never shared. If a block was allocated after the last * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. @@ -1270,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, /* * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewinded (until we reach something older than + * previous operations will be rewound (until we reach something older than * time_seq). */ static void @@ -1345,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, } /* - * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * Called with eb read locked. If the buffer cannot be rewound, the same buffer * is returned. If rewind operations happen, a fresh buffer is returned. The * returned buffer is always read-locked. If the returned buffer is not the * input buffer, the lock on the input buffer is released and the input buffer @@ -1373,7 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start, + eb->len); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1454,7 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(root->fs_info, logical); + eb = alloc_dummy_extent_buffer(root->fs_info, logical, + root->nodesize); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -1516,7 +1518,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * 3) the root is not forced COW. * * What is forced COW: - * when we create snapshot during commiting the transaction, + * when we create snapshot during committing the transaction, * after we've finished coping src root, we must COW the shared * block to ensure the metadata consistency. */ @@ -1531,7 +1533,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, /* * cows a single block, see __btrfs_cow_block for the real work. - * This version of it has extra checks so that a block isn't cow'd more than + * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, @@ -2986,7 +2988,7 @@ again: btrfs_unlock_up_safe(p, level + 1); /* - * Since we can unwind eb's we want to do a real search every + * Since we can unwind ebs we want to do a real search every * time. */ prev_cmp = -1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ddcc58f03c79..101c3cfd3f7c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -89,7 +89,7 @@ static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 -/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ #define REQ_GET_READ_MIRRORS (1 << 30) /* ioprio of readahead is set to idle */ @@ -431,7 +431,7 @@ struct btrfs_space_info { * bytes_pinned does not reflect the bytes that will be pinned once the * delayed refs are flushed, so this counter is inc'ed every time we * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zero'ed every + * freed once the transaction is committed. It will be zeroed every * time the transaction commits. */ struct percpu_counter total_bytes_pinned; @@ -1401,7 +1401,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) token->kaddr = NULL; } -/* some macros to generate set/get funcs for the struct fields. This +/* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: */ diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c24b653c7343..5fca9534a271 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root { /* * To make qgroup to skip given root. - * This is for snapshot, as btrfs_qgroup_inherit() will manully + * This is for snapshot, as btrfs_qgroup_inherit() will manually * modify counters for snapshot and its source, so we should skip * the snapshot in new_root/old_roots or it will get calculated twice */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 85f12e6e28d2..63ef9cdf0144 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -450,7 +450,7 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, } /* - * blocked until all flighting bios are finished. + * blocked until all in-flight bios operations are finished. */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91d123938cef..1142127f6e5e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -384,7 +384,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, /* * Things reading via commit roots that don't have normal protection, * like send, can have a really old block in cache that may point at a - * block that has been free'd and re-allocated. So don't clear uptodate + * block that has been freed and re-allocated. So don't clear uptodate * if we find an eb that is under IO (dirty/writeback) because we could * end up reading in the stale data and then writing it back out and * making everybody very sad. @@ -418,7 +418,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb) /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checkum. + * is filled with zeros and is included in the checksum. */ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -600,7 +600,7 @@ static noinline int check_leaf(struct btrfs_root *root, /* * Check to make sure that we don't point outside of the leaf, - * just incase all the items are consistent to eachother, but + * just in case all the items are consistent to each other, but * all point outside of the leaf. */ if (btrfs_item_end_nr(leaf, slot) > @@ -1147,7 +1147,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_test_extent_buffer(root->fs_info, bytenr, + root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1314,14 +1315,16 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(void) +struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize) { struct btrfs_root *root; root = btrfs_alloc_root(NULL, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, root, NULL, 1); + /* We don't use the stripesize in selftest, set it as sectorsize */ + __setup_root(nodesize, sectorsize, sectorsize, root, NULL, + BTRFS_ROOT_TREE_OBJECTID); set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; @@ -3022,7 +3025,7 @@ retry_root_backup: } /* - * Mount does not set all options immediatelly, we can do it now and do + * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit */ btrfs_apply_pending_changes(fs_info); @@ -3255,7 +3258,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) btrfs_warn_rl_in_rcu(device->dev_root->fs_info, "lost page write due to IO error on %s", rcu_str_deref(device->name)); - /* note, we dont' set_buffer_write_io_error because we have + /* note, we don't set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); @@ -4130,6 +4133,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb)) || + btrfs_super_stripesize(sb) != sectorsize) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); @@ -4367,7 +4381,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, if (ret) break; - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; @@ -4402,7 +4416,7 @@ again: if (ret) break; - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8e79d0070bcf..acba821499a9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -90,7 +90,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(void); +struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize); #endif /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9424864fd01a..689d25ac6a68 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root, { u64 end = start + num_bytes - 1; set_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); set_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); return 0; } @@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root, end = start + cache->key.offset - 1; clear_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); clear_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); } static int exclude_super_stripes(struct btrfs_root *root, @@ -980,7 +980,7 @@ out_free: * event that tree block loses its owner tree's reference and do the * back refs conversion. * - * When a tree block is COW'd through a tree, there are four cases: + * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. @@ -2042,6 +2042,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio *bbio = NULL; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(root->fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); @@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } + btrfs_bio_counter_dec(root->fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2595,7 +2601,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * Need to drop our head ref lock and re-aqcuire the + * Need to drop our head ref lock and re-acquire the * delayed ref lock and then re-check to make sure * nobody got added. */ @@ -2747,7 +2753,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) /* * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to ouse. + * closer to what we're really going to want to use. */ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } @@ -2851,7 +2857,7 @@ static void delayed_ref_async_start(struct btrfs_work *work) } /* - * trans->sync means that when we call end_transaciton, we won't + * trans->sync means that when we call end_transaction, we won't * wait on delayed refs */ trans->sync = true; @@ -4296,7 +4302,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * Called if we need to clear a data reservation for this inode * Normally in a error case. * - * This one will handle the per-indoe data rsv map for accurate reserved + * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) @@ -4967,7 +4973,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to @@ -5572,7 +5578,7 @@ void btrfs_orphan_release_metadata(struct inode *inode) * common file/directory operations, they change two fs/file trees * and root tree, the number of items that the qgroup reserves is * different with the free space reservation. So we can not use - * the space reseravtion mechanism in start_transaction(). + * the space reservation mechanism in start_transaction(). */ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, @@ -5621,7 +5627,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're relaseing. + * @num_bytes: the number of bytes we're releasing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of @@ -5647,7 +5653,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) drop_inode_space = 1; /* - * If we have more or the same amount of outsanding extents than we have + * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= @@ -5661,8 +5667,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) } /** - * calc_csum_metadata_size - return the amount of metada space that must be - * reserved/free'd for the given bytes. + * calc_csum_metadata_size - return the amount of metadata space that must be + * reserved/freed for the given bytes. * @inode: the inode we're manipulating * @num_bytes: the number of bytes in question * @reserve: 1 if we are reserving space, 0 if we are freeing space @@ -5814,7 +5820,7 @@ out_fail: /* * This is tricky, but first we need to figure out how much we - * free'd from any free-ers that occurred during this + * freed from any free-ers that occurred during this * reservation, so we reset ->csum_bytes to the csum_bytes * before we dropped our lock, and then call the free for the * number of bytes that were freed while we were trying our @@ -5836,7 +5842,7 @@ out_fail: /* * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have free'd more space had we + * more than to_free then we would have freed more space had we * not had an artificially high ->csum_bytes, so we need to free * the remainder. If bytes is the same or less then we don't * need to do anything, the other free-ers did the correct @@ -6515,7 +6521,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); unpin_extent_range(root, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -7578,7 +7584,7 @@ loop: if (loop == LOOP_CACHING_NOWAIT) { /* * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any unached bgs and we've alrelady done a + * don't have any uncached bgs and we've already done a * full search through. */ if (orig_have_caching_bg || !full_search) @@ -7982,7 +7988,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, /* * Mixed block groups will exclude before processing the log so we only - * need to do the exlude dance if this fs isn't mixed. + * need to do the exclude dance if this fs isn't mixed. */ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(root, ins->objectid, ins->offset); @@ -8032,7 +8038,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + buf->start + buf->len - 1); } else { buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, @@ -9426,7 +9432,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) u64 free_bytes = 0; int factor; - /* It's df, we don't care if it's racey */ + /* It's df, we don't care if it's racy */ if (list_empty(&sinfo->ro_bgs)) return 0; @@ -10635,14 +10641,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2f83448d34fe..a3412d68ad37 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -726,14 +726,6 @@ next: start = last_end + 1; if (start <= end && state && !need_resched()) goto hit_next; - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; search_again: if (start > end) @@ -742,6 +734,14 @@ search_again: if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + } static void wait_on_state(struct extent_io_tree *tree, @@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - BUG_ON(!prealloc); } spin_lock(&tree->lock); @@ -1037,7 +1043,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; out: spin_unlock(&tree->lock); @@ -1046,13 +1058,6 @@ out: return err; -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range * @cached_state: state that we're going to cache - * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist * already in this range they are set with the given bit and cleared of the * clear_bits. This is only meant to be used by things that are mergeable, ie * converting from say DELALLOC to DIRTY. This is not meant to be used with * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask) + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. We might have a cached state @@ -1106,7 +1112,7 @@ again: * extent state allocations are needed. We'll only know this * after locking the tree. */ - prealloc = alloc_extent_state(mask); + prealloc = alloc_extent_state(GFP_NOFS); if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1263,7 +1269,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; out: spin_unlock(&tree->lock); @@ -1271,21 +1283,11 @@ out: free_extent_state(prealloc); return err; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - first_iteration = false; - goto again; } /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, changeset); } @@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, changeset); } @@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret) err = ret; ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret && !err) err = ret; @@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, bio->bi_iter.bi_size = 0; map_length = length; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2036,6 +2044,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, dev = bbio->stripes[mirror_num-1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2044,6 +2053,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? */ + btrfs_bio_counter_dec(fs_info); bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; @@ -2053,6 +2063,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); + btrfs_bio_counter_dec(fs_info); bio_put(bio); return 0; } @@ -2232,13 +2243,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, /* set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) ret = set_state_failrec(failure_tree, start, failrec); /* set the bits in the inode's tree */ if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, - GFP_NOFS); + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); if (ret < 0) { kfree(failrec); return ret; @@ -4389,8 +4399,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret < 0) { btrfs_free_path(path); return ret; + } else { + WARN_ON(!ret); + if (ret == 1) + ret = 0; } - WARN_ON(!ret); + path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = found_key.type; @@ -4601,7 +4615,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) if (mapped) spin_unlock(&page->mapping->private_lock); - /* One for when we alloced the page */ + /* One for when we allocated the page */ put_page(page); } while (index != 0); } @@ -4714,16 +4728,16 @@ err: } struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { unsigned long len; if (!fs_info) { /* * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 + * available */ - len = 4096; + len = nodesize; } else { len = fs_info->tree_root->nodesize; } @@ -4819,7 +4833,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4827,7 +4841,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(fs_info, start); + eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); if (!eb) return NULL; eb->fs_info = fs_info; @@ -5761,7 +5775,7 @@ int try_release_extent_buffer(struct page *page) struct extent_buffer *eb; /* - * We need to make sure noboody is attaching this page to an eb right + * We need to make sure nobody is attaching this page to an eb right * now. */ spin_lock(&page->mapping->private_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 981f402bf754..c0c1c4fef6ce 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -220,8 +220,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); @@ -240,27 +239,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { int wake = 0; if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, + GFP_NOFS); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { - return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); + return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -278,37 +277,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask); + struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, + GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -348,7 +348,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -468,5 +468,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 318b048eb254..e0715fcfb11e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void) /** * free_extent_map - drop reference count of an extent_map - * @em: extent map being releasead + * @em: extent map being released * * Drops the reference out on @em by one and free the structure * if the reference count hits zero. diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7a7d6e253cfc..62a81ee13a5f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -248,7 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, offset + root->sectorsize - 1, - EXTENT_NODATASUM, GFP_NOFS); + EXTENT_NODATASUM); } else { btrfs_info(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c98805c35bab..e0c9bd3fb02d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1596,6 +1596,13 @@ again: copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + reserve_bytes); + dirty_sectors = round_up(copied + sector_offset, + root->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + dirty_sectors); + /* * if we have trouble faulting in the pages, fall * back to one page at a time @@ -1605,6 +1612,7 @@ again: if (copied == 0) { force_page_uptodate = true; + dirty_sectors = 0; dirty_pages = 0; } else { force_page_uptodate = false; @@ -1615,20 +1623,19 @@ again: /* * If we had a short copy we need to release the excess delaloc * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space will decrement it, but + * because btrfs_delalloc_release_space and + * btrfs_delalloc_release_metadata will decrement it, but * we still have an outstanding extent for the chunk we actually * managed to copy. */ - num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - root->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - dirty_sectors); - if (num_sectors > dirty_sectors) { - release_bytes = (write_bytes - copied) - & ~((u64)root->sectorsize - 1); + /* + * we round down because we don't want to count + * any partial blocks actually sent through the + * IO machines + */ + release_bytes = round_down(release_bytes - copied, + root->sectorsize); if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -2022,7 +2029,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed)) { /* - * We'v had everything committed since the last time we were + * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ @@ -2370,7 +2377,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* Check the aligned pages after the first unaligned page, * if offset != orig_start, which means the first unaligned page - * including serveral following pages are already in holes, + * including several following pages are already in holes, * the extra check can be skipped */ if (offset == orig_start) { /* after truncate page, check hole again */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5e6062c26129..69d270f6602c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,7 +29,7 @@ #include "inode-map.h" #include "volumes.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { @@ -1415,11 +1415,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u32 bytes_per_bitmap; + u64 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1638,10 +1638,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max_t(u32, max_bitmaps, 1); + max_bitmaps = max_t(u64, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as * we add more bitmaps. */ - bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE; + bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; if (bitmap_bytes >= max_bytes) { ctl->extents_thresh = 0; @@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want - * to reserve them to larger extents, however if we have plent + * to reserve them to larger extents, however if we have plenty * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ @@ -3662,7 +3662,7 @@ have_info: if (tmp->offset + tmp->bytes < offset) break; if (offset + bytes < tmp->offset) { - n = rb_prev(&info->offset_index); + n = rb_prev(&tmp->offset_index); continue; } info = tmp; @@ -3676,7 +3676,7 @@ have_info: if (offset + bytes < tmp->offset) break; if (tmp->offset + tmp->bytes < offset) { - n = rb_next(&info->offset_index); + n = rb_next(&tmp->offset_index); continue; } info = tmp; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33178c490ace..3af651c2bbc7 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space( int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); -/* Support functions for runnint our sanity tests */ +/* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct btrfs_block_group_cache *cache, u64 offset, u64 bytes, bool bitmap); diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index aae520b2aee5..a97fdc156a03 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -24,6 +24,11 @@ int __init btrfs_hash_init(void) return PTR_ERR_OR_ZERO(tfm); } +const char* btrfs_crc32c_impl(void) +{ + return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); +} + void btrfs_hash_exit(void) { crypto_free_shash(tfm); diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 118a2316e5d3..c3a2ec554361 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -22,6 +22,7 @@ int __init btrfs_hash_init(void); void btrfs_hash_exit(void); +const char* btrfs_crc32c_impl(void); u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 91419ef79b00..8b1212e8f7a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -455,7 +455,7 @@ again: /* * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. + * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) @@ -1978,7 +1978,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state, GFP_NOFS); + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -3119,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, - GFP_NOFS); + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); return 0; } @@ -3722,7 +3721,7 @@ cache_index: * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, - * but it guarantees correctness at the expense of ocassional full + * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ @@ -4978,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * be instantly completed which will give us extents that need * to be truncated. If we fail to get an orphan inode down we * could have left over extents that were never meant to live, - * so we need to garuntee from this point on that everything + * so we need to guarantee from this point on that everything * will be consistent. */ ret = btrfs_orphan_add(trans, inode); @@ -5248,7 +5247,7 @@ void btrfs_evict_inode(struct inode *inode) } /* - * We can't just steal from the global reserve, we need tomake + * We can't just steal from the global reserve, we need to make * sure there is room to do it, if not we need to commit and try * again. */ @@ -6980,7 +6979,18 @@ insert: * existing will always be non-NULL, since there must be * extent causing the -EEXIST. */ - if (start >= extent_map_end(existing) || + if (existing->start == em->start && + extent_map_end(existing) == extent_map_end(em) && + em->block_start == existing->block_start) { + /* + * these two extents are the same, it happens + * with inlines especially + */ + free_extent_map(em); + em = existing; + err = 0; + + } else if (start >= extent_map_end(existing) || start <= existing->start) { /* * The existing extent map is the one nearest to @@ -7433,7 +7443,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state); /* * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered + * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(inode, lockstart, @@ -7595,7 +7605,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (current->journal_info) { /* * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transction doesn't get + * that anything that needs to check if there's a transaction doesn't get * confused. */ dio_data = current->journal_info; @@ -7628,7 +7638,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * - * We return -ENOTBLK because thats what makes DIO go ahead and go back + * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ @@ -9041,7 +9051,7 @@ static int btrfs_truncate(struct inode *inode) return ret; /* - * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have * 3 things going on here * * 1) We need to reserve space for our orphan item and the space to @@ -9055,15 +9065,15 @@ static int btrfs_truncate(struct inode *inode) * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be seperate. The fact is we can use alot of + * And we need these to all be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space - * we will use, so we need the truncate reservation to be seperate so it + * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode or * removing the orphan item. We also need to be able to stop the * transaction and start a new one, which means we need to be able to * update the inode several times, and we have no idea of knowing how * many times that will be, so we can't just reserve 1 item for the - * entirety of the opration, so that has to be done seperately as well. + * entirety of the operation, so that has to be done separately as well. * Then there is the orphan item, which does indeed need to be held on * to for the whole operation, and we need nobody to touch this reserved * space except the orphan code. diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4e700694b741..05173563e4a6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } else { /* - * Revert back under same assuptions as above + * Revert back under same assumptions as above */ if (S_ISREG(mode)) { if (inode->i_size == 0) @@ -465,7 +465,7 @@ static noinline int create_subvol(struct inode *dir, /* * Don't create subvolume whose level is not zero. Or qgroup will be - * screwed up since it assume subvolme qgroup's level to be 0. + * screwed up since it assumes subvolume qgroup's level to be 0. */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; @@ -780,7 +780,7 @@ free_pending: * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability - * 6. If the victim is append-only or immutable we can't do antyhing with + * 6. If the victim is append-only or immutable we can't do anything with * links pointing to it. * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. @@ -846,11 +846,9 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (error == -EINTR) - // return error; + error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (error == -EINTR) + return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -1239,7 +1237,7 @@ again: set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state, GFP_NOFS); + &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state, @@ -2377,11 +2375,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (err == -EINTR) - // goto out_drop_write; + err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (err == -EINTR) + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2571,7 +2567,7 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); -//out_drop_write: +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -4654,7 +4650,7 @@ again: } /* - * mut. excl. ops lock is locked. Three possibilites: + * mut. excl. ops lock is locked. Three possibilities: * (1) some other op is running * (2) balance is running * (3) balance is paused -- special case (think resume) @@ -5571,7 +5567,7 @@ long btrfs_ioctl(struct file *file, unsigned int ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); /* * The transaction thread may want to do more work, - * namely it pokes the cleaner ktread that will start + * namely it pokes the cleaner kthread that will start * processing uncleaned subvols. */ wake_up_process(root->fs_info->transaction_kthread); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 559170464d7c..e96634a725c3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -718,12 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; int done; + int total_done = 0; INIT_LIST_HEAD(&splice); @@ -742,6 +743,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); btrfs_put_fs_root(root); + total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != -1) { @@ -752,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); + + return total_done; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8ef12623d65c..451507776ff5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -58,7 +58,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ @@ -199,7 +199,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, const u64 range_start, const u64 range_len); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e119552ed32..9d4c05b14f6e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -85,7 +85,7 @@ struct btrfs_qgroup { /* * temp variables for accounting operations - * Refer to qgroup_shared_accouting() for details. + * Refer to qgroup_shared_accounting() for details. */ u64 old_refcnt; u64 new_refcnt; @@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) } /* * we call btrfs_free_qgroup_config() when umounting - * filesystem and disabling quota, so we set qgroup_ulit + * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ ulist_free(fs_info->qgroup_ulist); @@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, /* * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and + * then this qgroup and all of the parent qgroups get their reference and * exclusive counts adjusted. * * Caller should hold fs_info->qgroup_lock. @@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, /* * No need to do lock, since this function will only be called in - * btrfs_commmit_transaction(). + * btrfs_commit_transaction(). */ node = rb_first(&delayed_refs->dirty_extent_root); while (node) { @@ -1557,7 +1557,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * A: cur_old_roots < nr_old_roots (not exclusive before) * !A: cur_old_roots == nr_old_roots (possible exclusive before) * B: cur_new_roots < nr_new_roots (not exclusive now) - * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * !B: cur_new_roots == nr_new_roots (possible exclusive now) * * Results: * +: Possible sharing -> exclusive -: Possible exclusive -> sharing @@ -1851,7 +1851,7 @@ out: } /* - * Copy the acounting information between qgroups. This is necessary + * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome. @@ -2340,7 +2340,7 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has alreay updated the + * only update status, since the previous part has already updated the * qgroup info. */ trans = btrfs_start_transaction(fs_info->quota_root, 1); @@ -2542,8 +2542,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) changeset.bytes_changed = 0; changeset.range_changed = ulist_alloc(GFP_NOFS); ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); trace_btrfs_qgroup_reserve_data(inode, start, len, changeset.bytes_changed, QGROUP_RESERVE); @@ -2580,8 +2579,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, return -ENOMEM; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -2672,7 +2670,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) } /* - * Check qgroup reserved space leaking, normally at destory inode + * Check qgroup reserved space leaking, normally at destroy inode * time */ void btrfs_qgroup_check_reserved_leak(struct inode *inode) @@ -2688,7 +2686,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) return; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0b7792e02dd5..f8b6d411a034 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can - * steal from cached rbio's though, other functions + * steal from cached rbios though, other functions * handle that. */ if (test_bit(RBIO_CACHE_BIT, &last->flags) || @@ -2368,7 +2368,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - /* Check scrubbing pairty and repair it */ + /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); parity = kmap(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) @@ -2493,7 +2493,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) /* * Here means we got one corrupted data stripe and one * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckly, use the other one to repair + * is scrubbing parity, luckily, use the other one to repair * the data, or we can not repair the data stripe. */ if (failp != rbio->scrubp) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 298631eaee78..8428db7cd88f 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) do { enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(fs_info, device); } + mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1cfd35cfac76..0477dca154ed 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -668,8 +668,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * roots of b-trees that reference the tree block. * * the basic idea of this function is check backrefs of a given block - * to find upper level blocks that refernece the block, and then check - * bakcrefs of these upper level blocks recursively. the recursion stop + * to find upper level blocks that reference the block, and then check + * backrefs of these upper level blocks recursively. the recursion stop * when tree root is reached or backrefs for the block is cached. * * NOTE: if we find backrefs for a block are cached, we know backrefs @@ -1160,7 +1160,7 @@ out: if (!RB_EMPTY_NODE(&upper->rb_node)) continue; - /* Add this guy's upper edges to the list to proces */ + /* Add this guy's upper edges to the list to process */ list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); if (list_empty(&upper->upper)) @@ -2396,7 +2396,7 @@ again: } /* - * we keep the old last snapshod transid in rtranid when we + * we keep the old last snapshot transid in rtranid when we * created the relocation tree. */ last_snap = btrfs_root_rtransid(&reloc_root->root_item); @@ -2616,7 +2616,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in + * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + rc->extent_root->nodesize * @@ -2814,7 +2814,7 @@ static void mark_block_processed(struct reloc_control *rc, u64 bytenr, u32 blocksize) { set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); } static void __mark_block_processed(struct reloc_control *rc, @@ -3182,7 +3182,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); + EXTENT_BOUNDARY); nr++; } @@ -4059,8 +4059,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans, rc->extent_root); @@ -4591,7 +4590,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, /* * called before creating snapshot. it calculates metadata reservation - * requried for relocating tree blocks in the snapshot + * required for relocating tree blocks in the snapshot */ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b2b14e7115f1..f1c30861d062 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, * search_key: the key to search * path: the path we search * root_item: the root item of the tree we look for - * root_key: the reak key of the tree we look for + * root_key: the root key of the tree we look for * - * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset * of the search key, just lookup the root with the highest offset for a * given objectid. * diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fa35cdc46494..70427ef66b04 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) * sure we read the bad mirror. */ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret) { /* set_extent_bits should give proper error */ WARN_ON(ret > 0); @@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) end, EXTENT_DAMAGED, 0, NULL); if (!corrected) clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); } out: @@ -1044,7 +1044,7 @@ nodatasum_case: /* * !is_metadata and !have_csum, this means that the data - * might not be COW'ed, that it might be modified + * might not be COWed, that it might be modified * concurrently. The general strategy to work on the * commit root does not help in the case when COW is not * used. @@ -1125,7 +1125,7 @@ nodatasum_case: * the 2nd page of mirror #1 faces I/O errors, and the 2nd page * of mirror #2 is readable but the final checksum test fails, * then the 2nd page of mirror #3 could be tried, whether now - * the final checksum succeedes. But this would be a rare + * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. At least it is * avoided that the good copy is overwritten. * A more useful improvement would be to pick the sectors @@ -2181,7 +2181,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio; + struct btrfs_bio *bbio = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2982,6 +2982,7 @@ again: extent_len); mapped_length = extent_len; + bbio = NULL; ret = btrfs_map_block(fs_info, READ, extent_logical, &mapped_length, &bbio, 0); if (!ret) { @@ -3581,6 +3582,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(root, cache); + if (!ret && is_dev_replace) { + /* + * If we are doing a device replace wait for any tasks + * that started dellaloc right before we set the block + * group to RO mode, as they might have just allocated + * an extent from it or decided they could do a nocow + * write. And if any such tasks did that, wait for their + * ordered extents to complete and then commit the + * current transaction, so that we can later see the new + * extent items in the extent tree - the ordered extents + * create delayed data references (for cow writes) when + * they complete, which will be run and insert the + * corresponding extent items into the extent tree when + * we commit the transaction they used when running + * inode.c:btrfs_finish_ordered_io(). We later use + * the commit root of the extent tree to find extents + * to copy from the srcdev into the tgtdev, and we don't + * want to miss any new extents. + */ + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + ret = btrfs_wait_ordered_roots(fs_info, -1, + cache->key.objectid, + cache->key.offset); + if (ret > 0) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + ret = PTR_ERR(trans); + else + ret = btrfs_commit_transaction(trans, + root); + if (ret) { + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + } scrub_pause_off(fs_info); if (ret == 0) { @@ -3601,9 +3642,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3639,6 +3682,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + if (ro_set) btrfs_dec_block_group_ro(root, cache); @@ -3676,9 +3724,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOMEM; break; } - - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; skip: key.offset = found_key.offset + length; btrfs_release_path(path); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6a8c86074aa4..b71dd298385c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1831,7 +1831,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, /* * If we have a parent root we need to verify that the parent dir was - * not delted and then re-created, if it was then we have no overwrite + * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ if (sctx->parent_root) { @@ -4192,9 +4192,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, return -ENOMEM; /* - * This hack is needed because empty acl's are stored as zero byte + * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte - * acl's will fail later. To fix this, we send a dummy acl list that + * acls will fail later. To fix this, we send a dummy acl list that * only contains the version number and no entries. */ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index e05619f241be..875c757e73e2 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The end result is that anyone who #includes ctree.h gets a * declaration for the btrfs_set_foo functions and btrfs_foo functions, - * which are wappers of btrfs_set_token_#bits functions and + * which are wrappers of btrfs_set_token_#bits functions and * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bf71071ab6f6..4339b6613f19 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -112,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * Note that a running device replace operation is not * canceled here although there is no way to update * the progress. It would add the risk of a deadlock, - * therefore the canceling is ommited. The only penalty + * therefore the canceling is omitted. The only penalty * is that some I/O remains active until the procedure * completes. The next time when the filesystem is * mounted writeable again, the device replace @@ -1877,7 +1877,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int ret; /* - * We aren't under the device list lock, so this is racey-ish, but good + * We aren't under the device list lock, so this is racy-ish, but good * enough for our purposes. */ nr_devices = fs_info->fs_devices->open_devices; @@ -1896,7 +1896,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) if (!devices_info) return -ENOMEM; - /* calc min stripe number for data space alloction */ + /* calc min stripe number for data space allocation */ type = btrfs_get_alloc_profile(root, 1); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; @@ -1932,7 +1932,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space *= BTRFS_STRIPE_LEN; /* - * In order to avoid overwritting the superblock on the drive, + * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ @@ -2303,7 +2303,7 @@ static void btrfs_interface_exit(void) static void btrfs_print_mod_info(void) { - printk(KERN_INFO "Btrfs loaded" + printk(KERN_INFO "Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2313,33 +2313,48 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif - "\n"); + "\n", + btrfs_crc32c_impl()); } static int btrfs_run_sanity_tests(void) { - int ret; - + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; ret = btrfs_init_test_fs(); if (ret) return ret; - - ret = btrfs_test_free_space_cache(); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(); - if (ret) - goto out; - ret = btrfs_test_extent_io(); - if (ret) - goto out; - ret = btrfs_test_inodes(); - if (ret) - goto out; - ret = btrfs_test_qgroups(); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(); + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index f54bf450bad3..10eb249ef891 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -175,7 +175,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) } struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length) +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize) { struct btrfs_block_group_cache *cache; @@ -192,8 +192,8 @@ btrfs_alloc_dummy_block_group(unsigned long length) cache->key.objectid = 0; cache->key.offset = length; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; + cache->sectorsize = sectorsize; + cache->full_stripe_len = sectorsize; INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 054b8c73c951..66fb6b701eb7 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -26,27 +26,28 @@ struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(void); -int btrfs_test_extent_buffer_operations(void); -int btrfs_test_extent_io(void); -int btrfs_test_inodes(void); -int btrfs_test_qgroups(void); -int btrfs_test_free_space_tree(void); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); +int btrfs_test_inodes(u32 sectorsize, u32 nodesize); +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length); +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(void) +static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_extent_buffer_operations(void) +static inline int btrfs_test_extent_buffer_operations(u32 sectorsize, + u32 nodesize) { return 0; } @@ -57,19 +58,19 @@ static inline int btrfs_init_test_fs(void) static inline void btrfs_destroy_test_fs(void) { } -static inline int btrfs_test_extent_io(void) +static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_inodes(void) +static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_qgroups(void) +static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_free_space_tree(void) +static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index f51963a8f929..4f8cbd1ec5ee 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -22,7 +22,7 @@ #include "../extent_io.h" #include "../disk-io.h" -static int test_btrfs_split_item(void) +static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { struct btrfs_path *path; struct btrfs_root *root; @@ -40,7 +40,7 @@ static int test_btrfs_split_item(void) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); return PTR_ERR(root); @@ -53,7 +53,8 @@ static int test_btrfs_split_item(void) return -ENOMEM; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, + nodesize); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; @@ -222,8 +223,8 @@ out: return ret; } -int btrfs_test_extent_buffer_operations(void) +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) { - test_msg("Running extent buffer operation tests"); - return test_btrfs_split_item(); + test_msg("Running extent buffer operation tests\n"); + return test_btrfs_split_item(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 70948b13bc81..d19ab0317283 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sizes.h> #include "btrfs-tests.h" +#include "../ctree.h" #include "../extent_io.h" #define PROCESS_UNLOCK (1 << 0) @@ -65,7 +66,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } -static int test_find_delalloc(void) +static int test_find_delalloc(u32 sectorsize) { struct inode *inode; struct extent_io_tree tmp; @@ -113,7 +114,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -122,9 +123,9 @@ static int test_find_delalloc(void) test_msg("Should have found at least one delalloc\n"); goto out_bits; } - if (start != 0 || end != 4095) { - test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", - start, end); + if (start != 0 || end != (sectorsize - 1)) { + test_msg("Expected start 0 end %u, got start %llu end %llu\n", + sectorsize - 1, start, end); goto out_bits; } unlock_extent(&tmp, start, end); @@ -144,7 +145,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -172,11 +173,11 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = max_bytes + 4096; + test_start = max_bytes + sectorsize; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Could'nt find the locked page\n"); + test_msg("Couldn't find the locked page\n"); goto out_bits; } start = test_start; @@ -199,7 +200,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -262,7 +263,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -272,6 +273,16 @@ out: return ret; } +/** + * test_bit_in_byte - Determine whether a bit is set in a byte + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit_in_byte(int nr, const u8 *addr) +{ + return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); +} + static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { @@ -298,25 +309,29 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return -EINVAL; } - bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Setting straddling pages failed\n"); - return -EINVAL; - } + /* Straddling pages test */ + if (len > PAGE_SIZE) { + bitmap_set(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } } /* @@ -333,7 +348,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, for (i = 0; i < len * BITS_PER_BYTE; i++) { int bit, bit1; - bit = !!test_bit(i, bitmap); + bit = !!test_bit_in_byte(i, (u8 *)bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { test_msg("Testing bit pattern failed\n"); @@ -351,15 +366,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return 0; } -static int test_eb_bitmaps(void) +static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { - unsigned long len = PAGE_SIZE * 4; + unsigned long len; unsigned long *bitmap; struct extent_buffer *eb; int ret; test_msg("Running extent buffer bitmap tests\n"); + /* + * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than + * BTRFS_MAX_METADATA_BLOCKSIZE. + */ + len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) + ? sectorsize * 4 : sectorsize; + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); @@ -379,7 +401,7 @@ static int test_eb_bitmaps(void) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len); + eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); if (!eb) { test_msg("Couldn't allocate test extent buffer\n"); kfree(bitmap); @@ -393,17 +415,17 @@ out: return ret; } -int btrfs_test_extent_io(void) +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; test_msg("Running extent I/O tests\n"); - ret = test_find_delalloc(); + ret = test_find_delalloc(sectorsize); if (ret) goto out; - ret = test_eb_bitmaps(); + ret = test_eb_bitmaps(sectorsize, nodesize); out: test_msg("Extent I/O tests finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 514247515312..3956bb2ff84c 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -22,10 +22,10 @@ #include "../disk-io.h" #include "../free-space-cache.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) /* - * This test just does basic sanity checking, making sure we can add an exten + * This test just does basic sanity checking, making sure we can add an extent * entry and remove space from either end and the middle, and make sure we can * remove space that covers adjacent extent entries. */ @@ -99,7 +99,8 @@ static int test_extents(struct btrfs_block_group_cache *cache) return 0; } -static int test_bitmaps(struct btrfs_block_group_cache *cache) +static int test_bitmaps(struct btrfs_block_group_cache *cache, + u32 sectorsize) { u64 next_bitmap_offset; int ret; @@ -139,7 +140,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) * The first bitmap we have starts at offset 0 so the next one is just * at the end of the first bitmap. */ - next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); /* Test a bit straddling two bitmaps */ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, @@ -167,9 +168,10 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) } /* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, + u32 sectorsize) { - u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); int ret; test_msg("Running bitmap and extent tests\n"); @@ -396,11 +398,13 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * wasn't optimal as they could be spread all over the block group while under * concurrency (extra overhead and fragmentation). * - * This stealing approach is benefical, since we always prefer to allocate from - * extent entries, both for clustered and non-clustered allocation requests. + * This stealing approach is beneficial, since we always prefer to allocate + * from extent entries, both for clustered and non-clustered allocation + * requests. */ static int -test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, + u32 sectorsize) { int ret; u64 offset; @@ -538,7 +542,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -596,8 +600,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { - test_msg("Cache free space is not 1Mb + 4Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", sectorsize); return -EINVAL; } @@ -610,22 +614,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is a sectorsize free space region in a bitmap. + * Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 4096) { - test_msg("Cache free space is not 4Kb\n"); + if (cache->free_space_ctl->free_space != sectorsize) { + test_msg("Cache free space is not %u\n", sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 4096, 0, + 0, sectorsize, 0, &max_extent_size); if (offset != (SZ_128M + SZ_16M)) { - test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", - offset); + test_msg("Failed to allocate %u, returned offset : %llu\n", + sectorsize, offset); return -EINVAL; } @@ -732,7 +739,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_32M, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -756,7 +763,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Confirm that our extent entry didn't stole all free space from the - * bitmap, because of the small 8Kb free space region. + * bitmap, because of the small 2 * sectorsize free space region. */ ret = check_num_extents_and_bitmaps(cache, 2, 1); if (ret) @@ -782,8 +789,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { - test_msg("Cache free space is not 1Mb + 8Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize); return -EINVAL; } @@ -795,21 +802,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 8Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is 2 * sectorsize free space region + * in a bitmap. Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 8192) { - test_msg("Cache free space is not 8Kb\n"); + if (cache->free_space_ctl->free_space != 2 * sectorsize) { + test_msg("Cache free space is not %u\n", 2 * sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 8192, 0, + 0, 2 * sectorsize, 0, &max_extent_size); if (offset != SZ_32M) { - test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + test_msg("Failed to allocate %u, offset: %llu\n", + 2 * sectorsize, offset); return -EINVAL; } @@ -824,7 +835,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return 0; } -int btrfs_test_free_space_cache(void) +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; @@ -832,13 +843,19 @@ int btrfs_test_free_space_cache(void) test_msg("Running btrfs free space cache tests\n"); - cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); + /* + * For ppc64 (with 64k page size), bytes per bitmap might be + * larger than 1G. To make bitmap test available in ppc64, + * alloc dummy block group whose size cross bitmaps. + */ + cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize + + PAGE_SIZE, sectorsize); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; } - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; @@ -854,14 +871,14 @@ int btrfs_test_free_space_cache(void) ret = test_extents(cache); if (ret) goto out; - ret = test_bitmaps(cache); + ret = test_bitmaps(cache, sectorsize); if (ret) goto out; - ret = test_bitmaps_and_extents(cache); + ret = test_bitmaps_and_extents(cache, sectorsize); if (ret) goto out; - ret = test_steal_space_from_bitmap_to_extent(cache); + ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7cea4462acd5..aac507085ab0 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../disk-io.h" @@ -30,7 +31,7 @@ struct free_space_extent { * The test cases align their operations to this in order to hit some of the * edge cases in the bitmap code. */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -439,7 +440,8 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_block_group_cache *, struct btrfs_path *); -static int run_test(test_func_t test_func, int bitmaps) +static int run_test(test_func_t test_func, int bitmaps, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; @@ -447,7 +449,7 @@ static int run_test(test_func_t test_func, int bitmaps) struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate dummy root\n"); ret = PTR_ERR(root); @@ -466,7 +468,8 @@ static int run_test(test_func_t test_func, int bitmaps) root->fs_info->free_space_root = root; root->fs_info->tree_root = root; - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, + nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -474,9 +477,9 @@ static int run_test(test_func_t test_func, int bitmaps) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -534,17 +537,18 @@ out: return ret; } -static int run_test_both_formats(test_func_t test_func) +static int run_test_both_formats(test_func_t test_func, + u32 sectorsize, u32 nodesize) { int ret; - ret = run_test(test_func, 0); + ret = run_test(test_func, 0, sectorsize, nodesize); if (ret) return ret; - return run_test(test_func, 1); + return run_test(test_func, 1, sectorsize, nodesize); } -int btrfs_test_free_space_tree(void) +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { test_func_t tests[] = { test_empty_block_group, @@ -561,9 +565,11 @@ int btrfs_test_free_space_tree(void) test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i]); + int ret = run_test_both_formats(tests[i], sectorsize, + nodesize); if (ret) { - test_msg("%pf failed\n", tests[i]); + test_msg("%pf : sectorsize %u failed\n", + tests[i], sectorsize); return ret; } } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 863a6a3af1f8..29648c0a39f1 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../btrfs_inode.h" @@ -86,19 +87,19 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] - * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] * - * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] - * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] + * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] * - * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] - * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] * - * [81920-86016] - * [ regular ] + * [69635-73731][ 73731 - 86019 ][86019-90115] + * [ regular ][ hole but no extent][ regular ] */ -static void setup_file_extents(struct btrfs_root *root) +static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) { int slot = 0; u64 disk_bytenr = SZ_1M; @@ -119,7 +120,7 @@ static void setup_file_extents(struct btrfs_root *root) insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; - offset = 4096; + offset = sectorsize; /* Now another hole */ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, @@ -128,99 +129,106 @@ static void setup_file_extents(struct btrfs_root *root) offset += 4; /* Now for a regular extent */ - insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - disk_bytenr += 4096; - offset += 4095; + disk_bytenr += sectorsize; + offset += sectorsize - 1; /* * Now for 3 extents that were split from a hole punch so we test * offsets properly. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, - 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, + BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now for a unwritten prealloc extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; + offset += sectorsize; /* * We want to jack up disk_bytenr a little more so the em stuff doesn't * merge our records. */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* * Now for a partially written prealloc extent, basically the same as * the hole punch example above. Ram_bytes never changes when you mark * extents written btw. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, + disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, + slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now a normal compressed extent */ - insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; + offset += 2 * sectorsize; /* No merges */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* Now a split compressed extent */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr + sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; - disk_bytenr += 8192; + offset += 2 * sectorsize; + disk_bytenr += 2 * sectorsize; /* Now extents that have a hole but no hole extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 16384; - disk_bytenr += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += 4 * sectorsize; + disk_bytenr += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } static unsigned long prealloc_only = 0; static unsigned long compressed_only = 0; static unsigned long vacancy_only = 0; -static noinline int test_btrfs_get_extent(void) +static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -240,7 +248,7 @@ static noinline int test_btrfs_get_extent(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -256,7 +264,7 @@ static noinline int test_btrfs_get_extent(void) goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -264,7 +272,7 @@ static noinline int test_btrfs_get_extent(void) /* * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidently release our page. + * extra ref so our searches don't accidentally release our page. */ extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); @@ -273,7 +281,7 @@ static noinline int test_btrfs_get_extent(void) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; test_msg("Got an error when we shouldn't have\n"); @@ -295,7 +303,7 @@ static noinline int test_btrfs_get_extent(void) * setup_file_extents, so if you change anything there you need to * update the comment and update the expected values below. */ - setup_file_extents(root); + setup_file_extents(root, sectorsize); em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { @@ -318,7 +326,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -327,7 +335,8 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected an inline, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4091) { + + if (em->start != offset || em->len != (sectorsize - 5)) { test_msg("Unexpected extent wanted start %llu len 1, got start " "%llu len %llu\n", offset, em->start, em->len); goto out; @@ -344,7 +353,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -366,7 +375,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -375,7 +384,7 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4095) { + if (em->start != offset || em->len != sectorsize - 1) { test_msg("Unexpected extent wanted start %llu len 4095, got " "start %llu len %llu\n", offset, em->start, em->len); goto out; @@ -393,7 +402,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -402,9 +411,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -421,7 +431,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -430,9 +440,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -442,7 +453,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -451,9 +462,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -475,7 +487,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -484,9 +496,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -503,7 +516,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -512,9 +525,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -532,7 +546,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -541,9 +555,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -564,7 +579,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -573,9 +588,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -598,7 +614,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -607,9 +623,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -631,7 +648,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -640,9 +657,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -665,7 +683,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -674,9 +692,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -691,7 +710,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -701,9 +720,10 @@ static noinline int test_btrfs_get_extent(void) disk_bytenr, em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -725,7 +745,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -734,9 +754,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -765,9 +786,10 @@ static noinline int test_btrfs_get_extent(void) * length of the actual hole, if this changes we'll have to change this * test. */ - if (em->start != offset || em->len != 12288) { - test_msg("Unexpected extent wanted start %llu len 12288, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 3 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 3 * sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -783,7 +805,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -792,9 +814,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -815,7 +838,7 @@ out: return ret; } -static int test_hole_first(void) +static int test_hole_first(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -832,7 +855,7 @@ static int test_hole_first(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -844,7 +867,7 @@ static int test_hole_first(void) goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -861,9 +884,9 @@ static int test_hole_first(void) * btrfs_get_extent. */ insert_inode_item_key(root); - insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, - BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -872,9 +895,10 @@ static int test_hole_first(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != 0 || em->len != 4096) { - test_msg("Unexpected extent wanted start 0 len 4096, got start " - "%llu len %llu\n", em->start, em->len); + if (em->start != 0 || em->len != sectorsize) { + test_msg("Unexpected extent wanted start 0 len %u, " + "got start %llu len %llu\n", + sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -884,18 +908,19 @@ static int test_hole_first(void) } free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; } - if (em->block_start != 4096) { + if (em->block_start != sectorsize) { test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != 4096 || em->len != 4096) { - test_msg("Unexpected extent wanted start 4096 len 4096, got " - "start %llu len %llu\n", em->start, em->len); + if (em->start != sectorsize || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %u len %u, " + "got start %llu len %llu\n", + sectorsize, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -912,7 +937,7 @@ out: return ret; } -static int test_extent_accounting(void) +static int test_extent_accounting(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -924,7 +949,7 @@ static int test_extent_accounting(void) return ret; } - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -954,10 +979,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, - BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -969,10 +995,10 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_KERNEL); @@ -987,10 +1013,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + + sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -1004,16 +1031,17 @@ static int test_extent_accounting(void) } /* - * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] * * I'm artificially adding 2 to outstanding_extents because in the * buffered IO case we'd add things up as we go, but I don't feel like * doing that here, this isn't the interesting case we want to test. */ BTRFS_I(inode)->outstanding_extents += 2; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, - (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, - NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, + (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1025,10 +1053,13 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + /* + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] + */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1042,8 +1073,8 @@ static int test_extent_accounting(void) /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); @@ -1063,8 +1094,9 @@ static int test_extent_accounting(void) * might fail and I'd rather satisfy my paranoia at this point. */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1103,7 +1135,7 @@ out: return ret; } -int btrfs_test_inodes(void) +int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { int ret; @@ -1112,13 +1144,13 @@ int btrfs_test_inodes(void) set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); - ret = test_btrfs_get_extent(); + ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - ret = test_hole_first(); + ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; test_msg("Running outstanding_extents tests\n"); - return test_extent_accounting(); + return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 8ea5d34bc5a2..57a12c0d680b 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../transaction.h" @@ -216,7 +217,8 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } -static int test_no_shared_qgroup(struct btrfs_root *root) +static int test_no_shared_qgroup(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -227,29 +229,30 @@ static int test_no_shared_qgroup(struct btrfs_root *root) btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5); + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } /* - * Since the test trans doesn't havee the complicated delayed refs, + * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -257,32 +260,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_item(root, 4096, 4096); + ret = remove_extent_item(root, nodesize, nodesize); if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -290,14 +294,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -310,7 +314,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) * right, also remove one of the roots and make sure the exclusive count is * adjusted properly. */ -static int test_multiple_refs(struct btrfs_root *root) +static int test_multiple_refs(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -322,25 +327,29 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); - /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256); + /* + * We have BTRFS_FS_TREE_OBJECTID created already from the + * previous test. + */ + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -348,30 +357,32 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = add_tree_ref(root, 4096, 4096, 0, 256); + ret = add_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -379,35 +390,38 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_ref(root, 4096, 4096, 0, 256); + ret = remove_extent_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -415,19 +429,21 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -435,13 +451,13 @@ static int test_multiple_refs(struct btrfs_root *root) return 0; } -int btrfs_test_qgroups(void) +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); return PTR_ERR(root); @@ -468,7 +484,8 @@ int btrfs_test_qgroups(void) * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, nodesize, + nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -476,16 +493,16 @@ int btrfs_test_qgroups(void) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 5; + tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID; root->fs_info->fs_root = tmp_root; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { @@ -493,14 +510,14 @@ int btrfs_test_qgroups(void) goto out; } - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 256; + tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { test_msg("Couldn't insert fs root %d\n", ret); @@ -508,10 +525,10 @@ int btrfs_test_qgroups(void) } test_msg("Running qgroup tests\n"); - ret = test_no_shared_qgroup(root); + ret = test_no_shared_qgroup(root, sectorsize, nodesize); if (ret) goto out; - ret = test_multiple_refs(root); + ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5b0b758a3f79..f6e24cb423ae 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -944,7 +944,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, err = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 72be51f7ca2f..9fe0ec2bf0fe 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -144,7 +144,7 @@ struct btrfs_pending_snapshot { /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; - /* extra metadata reseration for relocation */ + /* extra metadata reservation for relocation */ int error; bool readonly; struct list_head list; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8aaca5c6af94..b7665af471d8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2330,7 +2330,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; /* for regular files, make sure corresponding - * orhpan item exist. extents past the new EOF + * orphan item exist. extents past the new EOF * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { @@ -3001,7 +3001,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + EXTENT_DIRTY | EXTENT_NEW); } /* @@ -4914,7 +4914,7 @@ out_unlock: * the actual unlink operation, so if we do this check before a concurrent task * sets last_unlink_trans it means we've logged a consistent version/state of * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task migth have only updated last_unlink_trans before + * commit (the concurrent task might have only updated last_unlink_trans before * we logged the inode or it might have also done the unlink). */ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, @@ -4973,7 +4973,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, while (1) { /* * If we are logging a directory then we start with our inode, - * not our parents inode, so we need to skipp setting the + * not our parent's inode, so we need to skip setting the * logged_trans so that further down in the log code we don't * think this inode has already been logged. */ @@ -5357,7 +5357,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, log_dentries = true; /* - * On unlink we must make sure all our current and old parent directores + * On unlink we must make sure all our current and old parent directory * inodes are fully logged. This is to prevent leaving dangling * directory index entries in directories that were our parents but are * not anymore. Not doing this results in old parent directory being diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 91feb2bdefee..b1434bb57e36 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -28,7 +28,7 @@ * } * ulist_free(ulist); * - * This assumes the graph nodes are adressable by u64. This stems from the + * This assumes the graph nodes are addressable by u64. This stems from the * usage for tree enumeration in btrfs, where the logical addresses are * 64 bit. * diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2b88127bba5b..548faaa9e169 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2190,7 +2190,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) } /* - * strore the expected generation for seed devices in device items. + * Store the expected generation for seed devices in device items. */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2761,6 +2761,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; /* Just in case */ root = root->fs_info->chunk_root; @@ -2787,12 +2788,19 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } @@ -2811,11 +2819,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } } } + mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -3387,7 +3398,7 @@ static int should_balance_chunk(struct btrfs_root *root, } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { /* * Same logic as the 'limit' filter; the minimum cannot be - * determined here because we do not have the global informatoin + * determined here because we do not have the global information * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) @@ -4230,6 +4241,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); return ret; } @@ -5762,20 +5774,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - if (physical_of_found + map->stripe_len <= - dev_replace->cursor_left) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; - tgtdev_indexes++; - num_stripes++; - } + tgtdev_indexes++; + num_stripes++; } } @@ -6076,7 +6085,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { - /* Shoud be the original bio. */ + /* Should be the original bio. */ WARN_ON(bio != bbio->orig_bio); btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -6250,27 +6259,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk) +/* Return -EIO if any error, otherwise return 0. */ +static int btrfs_check_chunk_valid(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - u64 logical; u64 length; u64 stripe_len; - u64 devid; - u8 uuid[BTRFS_UUID_SIZE]; - int num_stripes; - int ret; - int i; + u16 num_stripes; + u16 sub_stripes; + u64 type; - logical = key->offset; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - /* Validation check */ + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + if (!num_stripes) { btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", num_stripes); @@ -6281,6 +6286,11 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk logical %llu", logical); return -EIO; } + if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) { + btrfs_err(root->fs_info, "invalid chunk sectorsize %u", + btrfs_chunk_sector_size(leaf, chunk)); + return -EIO; + } if (!length || !IS_ALIGNED(length, root->sectorsize)) { btrfs_err(root->fs_info, "invalid chunk length %llu", length); @@ -6292,13 +6302,54 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return -EIO; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)) { + type) { btrfs_err(root->fs_info, "unrecognized chunk type: %llu", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); return -EIO; } + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && + num_stripes != 1)) { + btrfs_err(root->fs_info, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EIO; + } + + return 0; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 stripe_len; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + ret = btrfs_check_chunk_valid(root, leaf, chunk, logical); + if (ret) + return ret; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6546,6 +6597,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 array_size; u32 len = 0; u32 cur_offset; + u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6560,7 +6612,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* - * The sb extent buffer is artifical and just used to read the system array. + * The sb extent buffer is artificial and just used to read the system array. * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all @@ -6612,6 +6664,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) break; } + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(root->fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6630,13 +6691,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_array_offset += len; cur_offset += len; } - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return ret; out_short_read: printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", len, cur_offset); - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return -EIO; } @@ -6648,6 +6711,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) struct btrfs_key found_key; int ret; int slot; + u64 total_dev = 0; root = root->fs_info->chunk_root; @@ -6689,6 +6753,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) ret = read_one_dev(root, leaf, dev_item); if (ret) goto error; + total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -6698,6 +6763,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } path->slots[0]++; } + + /* + * After loading chunk tree, we've got all device information, + * do another round of validation checks. + */ + if (total_dev != root->fs_info->fs_devices->total_devices) { + btrfs_err(root->fs_info, + "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_super_num_devices(root->fs_info->super_copy), + total_dev); + ret = -EINVAL; + goto error; + } + if (btrfs_super_total_bytes(root->fs_info->super_copy) < + root->fs_info->fs_devices->total_rw_bytes) { + btrfs_err(root->fs_info, + "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", + btrfs_super_total_bytes(root->fs_info->super_copy), + root->fs_info->fs_devices->total_rw_bytes); + ret = -EINVAL; + goto error; + } ret = 0; error: unlock_chunks(root); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3bfb252206c7..d1a177a3dbe8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, - int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_set_prop(d_inode(dentry), name, value, size, flags); + return btrfs_set_prop(inode, name, value, size, flags); } static const struct xattr_handler btrfs_security_xattr_handler = { diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 861d611b8c05..ce5f345d70f5 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) * check if the backing cache is updated to FS-Cache * - called by FS-Cache when evaluates if need to invalidate the cache */ -static bool cachefiles_check_consistency(struct fscache_operation *op) +static int cachefiles_check_consistency(struct fscache_operation *op) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 43098cd9602b..26a9d10d75e9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page) /* * Finish an async read(ahead) op. */ -static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) +static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_osd_data *osd_data; - int rc = req->r_result; - int bytes = le32_to_cpu(msg->hdr.data_len); + int rc = req->r_result <= 0 ? req->r_result : 0; + int bytes = req->r_result >= 0 ? req->r_result : 0; int num_pages; int i; @@ -276,8 +276,10 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0 && rc != -ENOENT) + if (rc < 0 && rc != -ENOENT) { + ceph_fscache_readpage_cancel(inode, page); goto unlock; + } if (bytes < (int)PAGE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -376,8 +378,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -537,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_readpage_to_fscache(inode, page); - set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, @@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { - dout("writepage setting page/mapping error %d %p\n", err, page); + struct writeback_control tmp_wbc; + if (!wbc) + wbc = &tmp_wbc; + if (err == -ERESTARTSYS) { + /* killed by SIGKILL */ + dout("writepage interrupted page %p\n", page); + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + goto out; + } + dout("writepage setting page/mapping error %d %p\n", + err, page); SetPageError(page); mapping_set_error(&inode->i_data, err); - if (wbc) - wbc->pages_skipped++; + wbc->pages_skipped++; } else { dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ @@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); err = writepage_nounlock(page, wbc); + if (err == -ERESTARTSYS) { + /* direct memory reclaimer was killed by SIGKILL. return 0 + * to prevent caller from setting mapping/page error */ + err = 0; + } unlock_page(page); iput(inode); return err; } - /* * lame release_pages helper. release_pages() isn't exported to * modules. @@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num) * If we get an error, set the mapping error bit, but not the individual * page error bits. */ -static void writepages_finish(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); @@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_fs_client *fsc = ceph_inode_to_client(inode); bool remove_page; - dout("writepages_finish %p rc %d\n", inode, rc); if (rc < 0) mapping_set_error(mapping, rc); @@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + if (rc < 0) + SetPageError(page); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); @@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - pr_warn("writepage_start %p on forced umount\n", inode); - truncate_pagecache(inode, 0); + if (ci->i_wrbuffer_ref > 0) { + pr_warn_ratelimited( + "writepage_start %p %lld forced umount\n", + inode, ceph_ino(inode)); + } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } @@ -1063,10 +1079,7 @@ new_request: pages = NULL; } - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1099,8 +1112,7 @@ release_pvec_pages: mapping->writeback_index = index; out: - if (req) - ceph_osdc_put_request(req); + ceph_osdc_put_request(req); ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; @@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file, struct page *page) { struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); loff_t page_off = pos & PAGE_MASK; int pos_in_page = pos & ~PAGE_MASK; @@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + unlock_page(page); + return -EIO; + } + retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); @@ -1165,7 +1184,7 @@ retry_locked: snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); - r = wait_event_interruptible(ci->i_cap_wq, + r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); if (r == -ERESTARTSYS) @@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = { .direct_IO = ceph_direct_io, }; +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} /* * vm ops @@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_SHIFT; int want, got, ret; + sigset_t oldset; + + ceph_block_sigs(&oldset); dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); @@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, - -1, &got, &pinned_page); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - return VM_FAULT_SIGBUS; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (ret < 0) + goto out_restore; + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); @@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ceph_put_cap_refs(ci, got); if (ret != -EAGAIN) - return ret; + goto out_restore; /* read inline data */ if (off >= PAGE_SIZE) { @@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; - goto out; + goto out_inline; } ret1 = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); if (ret1 < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); - ret = VM_FAULT_SIGBUS; - goto out; + if (ret1 < 0) + ret = ret1; + else + ret = VM_FAULT_SIGBUS; + goto out_inline; } if (ret1 < PAGE_SIZE) zero_user_segment(page, ret1, PAGE_SIZE); @@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_SIZE, ret); } -out: - dout("filemap_fault %p %llu~%zd read inline data ret %d\n", - inode, off, (size_t)PAGE_SIZE, ret); +out_restore: + ceph_restore_sigs(&oldset); + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return ret; } @@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size = i_size_read(inode); size_t len; int want, got, ret; + sigset_t oldset; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) - return VM_FAULT_SIGBUS; + return VM_FAULT_OOM; + + ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; @@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } } if (off + PAGE_SIZE <= size) @@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - ret = VM_FAULT_SIGBUS; - goto out_free; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (ret < 0) + goto out_free; + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); /* Update time before taking page lock */ file_update_time(vma->vm_file); - lock_page(page); + do { + lock_page(page); - ret = VM_FAULT_NOPAGE; - if ((off > size) || - (page->mapping != inode->i_mapping)) { - unlock_page(page); - goto out; - } + if ((off > size) || (page->mapping != inode->i_mapping)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + break; + } + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret >= 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + ret = VM_FAULT_LOCKED; + } + } while (ret == -EAGAIN); - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret >= 0) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); - ret = VM_FAULT_LOCKED; - } else { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; - } -out: if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; @@ -1495,8 +1523,10 @@ out: inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); out_free: + ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); - + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; return ret; } @@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; - snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), - "%llx.00000000", ci->i_vino.ino); - rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); + + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); @@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; - wr_req->r_base_oid = rd_req->r_base_oid; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); + + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); @@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) @@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) out_unlock: up_write(&mdsc->pool_perm_rwsem); - if (rd_req) - ceph_osdc_put_request(rd_req); - if (wr_req) - ceph_osdc_put_request(wr_req); + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); out: if (!err) err = have; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc..238c55b01723 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -25,6 +25,7 @@ #include "cache.h" struct ceph_aux_inode { + u64 version; struct timespec mtime; loff_t size; }; @@ -69,15 +70,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, fsc, true); - - if (fsc->fscache == NULL) { + if (!fsc->fscache) pr_err("Unable to resgister fsid: %p fscache cookie", fsc); - return 0; - } - - fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); - if (fsc->revalidate_wq == NULL) - return -ENOMEM; return 0; } @@ -105,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, const struct inode* inode = &ci->vfs_inode; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -131,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OBSOLETE; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -181,32 +177,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .now_uncached = ceph_fscache_inode_now_uncached, }; -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - struct inode* inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ if (fsc->fscache == NULL) return; /* Only cache for regular files that are read only */ - if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + if (!S_ISREG(inode->i_mode)) return; - /* Avoid multiple racing open requests */ - inode_lock(inode); - - if (ci->fscache) - goto done; - - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - ci, true); - fscache_check_consistency(ci->fscache); -done: + inode_lock_nested(inode, I_MUTEX_CHILD); + if (!ci->fscache) { + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, false); + } inode_unlock(inode); - } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) @@ -222,6 +212,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) fscache_relinquish_cookie(cookie, 0); } +static bool ceph_fscache_can_enable(void *data) +{ + struct inode *inode = data; + return !inode_is_open_for_write(inode); +} + +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!fscache_cookie_valid(ci->fscache)) + return; + + if (inode_is_open_for_write(inode)) { + dout("fscache_file_set_cookie %p %p disabling cache\n", + inode, filp); + fscache_disable_cookie(ci->fscache, false); + fscache_uncache_all_inode_pages(ci->fscache, inode); + } else { + fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, + inode); + if (fscache_cookie_enabled(ci->fscache)) { + dout("fscache_file_set_cookie %p %p enabing cache\n", + inode, filp); + } + } +} + static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) { if (!error) @@ -236,10 +254,9 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int unlock_page(page); } -static inline int cache_valid(struct ceph_inode_info *ci) +static inline bool cache_valid(struct ceph_inode_info *ci) { - return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && - (ci->i_fscache_gen == ci->i_rdcache_gen)); + return ci->i_fscache_gen == ci->i_rdcache_gen; } @@ -332,69 +349,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fsc->revalidate_wq) - destroy_workqueue(fsc->revalidate_wq); - fscache_relinquish_cookie(fsc->fscache, 0); fsc->fscache = NULL; } -static void ceph_revalidate_work(struct work_struct *work) -{ - int issued; - u32 orig_gen; - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, - i_revalidate_work); - struct inode *inode = &ci->vfs_inode; - - spin_lock(&ci->i_ceph_lock); - issued = __ceph_caps_issued(ci, NULL); - orig_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - - if (!(issued & CEPH_CAP_FILE_CACHE)) { - dout("revalidate_work lost cache before validation %p\n", - inode); - goto out; - } - - if (!fscache_check_consistency(ci->fscache)) - fscache_invalidate(ci->fscache); - - spin_lock(&ci->i_ceph_lock); - /* Update the new valid generation (backwards sanity check too) */ - if (orig_gen > ci->i_fscache_gen) { - ci->i_fscache_gen = orig_gen; - } - spin_unlock(&ci->i_ceph_lock); - -out: - iput(&ci->vfs_inode); -} - -void ceph_queue_revalidate(struct inode *inode) +/* + * caller should hold CEPH_CAP_FILE_{RD,CACHE} + */ +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_inode_info *ci = ceph_inode(inode); - - if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + if (cache_valid(ci)) return; - ihold(inode); - - if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, - &ci->i_revalidate_work)) { - dout("ceph_queue_revalidate %p\n", inode); - } else { - dout("ceph_queue_revalidate %p failed\n)", inode); - iput(inode); + /* resue i_truncate_mutex. There should be no pending + * truncate while the caller holds CEPH_CAP_FILE_RD */ + mutex_lock(&ci->i_truncate_mutex); + if (!cache_valid(ci)) { + if (fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + spin_lock(&ci->i_ceph_lock); + ci->i_fscache_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); } -} - -void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ - ci->fscache = NULL; - /* The first load is verifed cookie open time */ - ci->i_fscache_gen = 1; - INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); + mutex_unlock(&ci->i_truncate_mutex); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 5ac591bd012b..7e72c7594f0c 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -34,10 +34,10 @@ void ceph_fscache_unregister(void); int ceph_fscache_register_fs(struct ceph_fs_client* fsc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); -void ceph_fscache_inode_init(struct ceph_inode_info *ci); -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci); +void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); int ceph_readpage_from_fscache(struct inode *inode, struct page *page); int ceph_readpages_from_fscache(struct inode *inode, @@ -46,12 +46,11 @@ int ceph_readpages_from_fscache(struct inode *inode, unsigned *nr_pages); void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); -void ceph_queue_revalidate(struct inode *inode); -static inline void ceph_fscache_update_objectsize(struct inode *inode) +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); - fscache_attr_changed(ci->fscache); + ci->fscache = NULL; + ci->i_fscache_gen = 0; } static inline void ceph_fscache_invalidate(struct inode *inode) @@ -88,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, return fscache_readpages_cancel(ci->fscache, pages); } +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +{ + ci->i_fscache_gen = ci->i_rdcache_gen - 1; +} + #else static inline int ceph_fscache_register(void) @@ -112,8 +116,20 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } -static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, - struct ceph_inode_info* ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_file_set_cookie(struct inode *inode, + struct file *filp) +{ +} + +static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { } @@ -141,10 +157,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } -static inline void ceph_fscache_update_objectsize(struct inode *inode) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } @@ -154,10 +166,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode, { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) -{ -} - static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) { return 1; @@ -173,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, { } -static inline void ceph_queue_revalidate(struct inode *inode) +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) { } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cfaeef18cbca..6f60d0a3d0f9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1656,7 +1656,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ @@ -1698,8 +1698,8 @@ retry_locked: revoking = cap->implemented & ~cap->issued; dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", - cap->mds, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap_used), + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -2317,7 +2317,7 @@ again: /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) == 0) { + if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; @@ -2393,6 +2393,9 @@ again: snap_rwsem_locked = true; } *got = need | (have & want); + if ((need & CEPH_CAP_FILE_RD) && + !(*got & CEPH_CAP_FILE_CACHE)) + ceph_disable_fscache_readpage(ci); __take_cap_refs(ci, *got, true); ret = 1; } @@ -2412,12 +2415,26 @@ again: goto out_unlock; } - if (!__ceph_is_any_caps(ci) && - ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; - goto out_unlock; + if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { + int mds_wanted; + if (ACCESS_ONCE(mdsc->fsc->mount_state) == + CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci); + if ((mds_wanted & need) != need) { + dout("get_cap_refs %p caps were dropped" + " (session killed?)\n", inode); + *err = -ESTALE; + ret = 1; + goto out_unlock; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } dout("get_cap_refs %p have %s needed %s\n", inode, @@ -2487,7 +2504,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err == -EAGAIN) continue; if (err < 0) - return err; + ret = err; } else { ret = wait_event_interruptible(ci->i_cap_wq, try_get_cap_refs(ci, need, want, endoff, @@ -2496,8 +2513,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, continue; if (err < 0) ret = err; - if (ret < 0) - return ret; + } + if (ret < 0) { + if (err == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; + } + return ret; } if (ci->i_inline_version != CEPH_INLINE_NONE && @@ -2533,6 +2557,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, break; } + if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + ceph_fscache_revalidate_cookie(ci); + *got = _got; return 0; } @@ -2774,7 +2801,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; - bool queue_revalidate = false; bool deleted_inode = false; bool fill_inline = false; @@ -2807,7 +2833,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - !ci->i_wrbuffer_ref) { + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ @@ -2816,8 +2842,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } - - ceph_fscache_invalidate(inode); } /* side effects now are allowed */ @@ -2859,11 +2883,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } } - /* Do we need to revalidate our fscache cookie. Don't bother on the - * first cache cap as we already validate at cookie creation time. */ - if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) - queue_revalidate = true; - if (newcaps & CEPH_CAP_ANY_RD) { /* ctime/mtime/atime? */ ceph_decode_timespec(&mtime, &grant->mtime); @@ -2972,11 +2991,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (fill_inline) ceph_fill_inline_data(inode, NULL, inline_data, inline_len); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_queue_revalidate(inode); - } else if (queue_revalidate) - ceph_queue_revalidate(inode); if (writeback) /* @@ -3178,10 +3194,8 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_fscache_invalidate(inode); - } } /* @@ -3226,6 +3240,8 @@ retry: if (target < 0) { __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; goto out_unlock; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 31f831471ed2..39ff678e567f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); kfree(path); - } else if (req->r_path2) { + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3ab1192d2029..6e0fedf6713b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -70,16 +70,42 @@ out_unlock: } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos. + * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); + if (!cache_ctl->page) { + dout(" page %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_mutex, no need to use page lock */ + unlock_page(cache_ctl->page); + cache_ctl->dentries = kmap(cache_ctl->page); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; - unsigned nsize = PAGE_SIZE / sizeof(struct dentry *); - int err = 0; - loff_t ptr_pos = 0; struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; + int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); + + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linar search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } - /* we can calculate cache index for the first dirfrag */ - if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { - cache_ctl.index = fpos_off(ctx->pos) - 2; - BUG_ON(cache_ctl.index < 0); - ptr_pos = cache_ctl.index * sizeof(struct dentry *); + dout("__dcache_readdir %p cache idx %llu\n", dir, idx); } - while (true) { - pgoff_t pgoff; - bool emit_dentry; - if (ptr_pos >= i_size_read(dir)) { + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { fi->flags |= CEPH_F_ATEND; err = 0; break; } - - err = -EAGAIN; - pgoff = ptr_pos >> PAGE_SHIFT; - if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { - ceph_readdir_cache_release(&cache_ctl); - cache_ctl.page = find_lock_page(&dir->i_data, pgoff); - if (!cache_ctl.page) { - dout(" page %lu not found\n", pgoff); - break; - } - /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ - unlock_page(cache_ctl.page); - cache_ctl.dentries = kmap(cache_ctl.page); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; } - rcu_read_lock(); - spin_lock(&parent->d_lock); - /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ - if (ceph_dir_is_complete_ordered(dir) && - ptr_pos < i_size_read(dir)) - dentry = cache_ctl.dentries[cache_ctl.index % nsize]; - else - dentry = NULL; - spin_unlock(&parent->d_lock); - if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) - dentry = NULL; - rcu_read_unlock(); - if (!dentry) - break; - - emit_dentry = false; di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && - ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) { emit_dentry = true; } spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, } else { dput(dentry); } - - cache_ctl.index++; - ptr_pos += sizeof(struct dentry *); } +out: ceph_readdir_cache_release(&cache_ctl); if (last) { int ret; @@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + int i; int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); if (fi->flags & CEPH_F_ATEND) return 0; @@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { ino_t ino = parent_ino(file->f_path.dentry); @@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } /* can we use the dcache? */ @@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); - off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -305,6 +372,13 @@ more: fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -331,6 +405,8 @@ more: req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); req->r_inode = inode; ihold(inode); @@ -340,22 +416,26 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - off = req->r_readdir_offset; - fi->next_offset = off; + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; - fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_did_prepopulate) { @@ -363,7 +443,8 @@ more: if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ fi->dir_ordered_count = 0; - } else if (ceph_frag_is_leftmost(frag) && off == 2) { + } else if (ceph_frag_is_leftmost(frag) && + fi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ fi->dir_release_count = req->r_dir_release_cnt; @@ -377,65 +458,87 @@ more: fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - if (ceph_frag_is_rightmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - } else { - err = note_last_dentry(fi, - rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1], - fi->next_offset + rinfo->dir_nr); + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); + err = note_last_dentry(fi, rde->name, rde->name_len, + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); - - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_inode *in = - rinfo->dir_in[off - fi->offset].in; + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + fi->frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); + + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, - rinfo->dir_dname_len[off - fi->offset], - rinfo->dir_dname[off - fi->offset], in); - BUG_ON(!in); - ftype = le32_to_cpu(in->mode) >> 12; - vino.ino = le64_to_cpu(in->ino); - vino.snap = le64_to_cpu(in->snapid); + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, + rde->name_len, rde->name, &rde->inode.in); + + BUG_ON(!rde->inode.in); + ftype = le32_to_cpu(rde->inode.in->mode) >> 12; + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); - if (!dir_emit(ctx, - rinfo->dir_dname[off - fi->offset], - rinfo->dir_dname_len[off - fi->offset], - ceph_translate_ino(inode->i_sb, ino), ftype)) { + + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } - off++; ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - off = 0; - ctx->pos = ceph_make_fpos(frag, off); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -467,7 +570,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +static void reset_readdir(struct ceph_file_info *fi) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->last_name = NULL; fi->dir_release_count = 0; fi->readdir_cache_idx = -1; - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; /* compensate for . and .. */ - else - fi->next_offset = 0; + fi->next_offset = 2; /* compensate for . and .. */ fi->flags &= ~CEPH_F_ATEND; } +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; + if (new_pos == 0) + return true; + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sotred in hash order */ + } else if (fi->frag |= fpos_frag(new_pos)) { + return true; + } + rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); +} + static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; inode_lock(inode); @@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (offset == 0 || - fpos_frag(offset) != fi->frag || - fpos_off(offset) < fi->offset) { - /* discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk */ - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi, fpos_frag(offset)); - } else if (fpos_cmp(offset, old_offset) > 0) { - /* reset dir_release_count if we did a forward seek */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; - } } out: inode_unlock(inode); @@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4f1dc7120916..ce2f5795e44b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: - /* First file open request creates the cookie, we want to keep - * this cookie around for the filetime of the inode as not to - * have to worry about fscache register / revoke / operation - * races. - * - * Also, if we know the operation is going to invalidate data - * (non readonly) just nuke the cache right away. - */ - ceph_fscache_register_inode_cookie(mdsc->fsc, ci); - if ((fmode & CEPH_FILE_MODE_WR)) - ceph_fscache_invalidate(inode); + ceph_fscache_register_inode_cookie(inode); + ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -192,6 +180,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_fmode = -1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? err : 0; +} + +/* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). We do, however, need to inform the MDS (asynchronously) @@ -616,8 +657,7 @@ static void ceph_aio_complete(struct inode *inode, kfree(aio_req); } -static void ceph_aio_complete_req(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_aio_complete_req(struct ceph_osd_request *req) { int rc = req->r_result; struct inode *inode = req->r_inode; @@ -714,14 +754,21 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; - req->r_base_oid = orig_req->r_base_oid; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); + + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - ceph_osdc_build_request(req, req->r_ops[0].extent.offset, - snapc, CEPH_NOSNAP, &aio_req->mtime); + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; ceph_osdc_put_request(orig_req); @@ -733,7 +780,7 @@ static void ceph_aio_retry_work(struct work_struct *work) out: if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } ceph_put_snap_context(snapc); @@ -764,6 +811,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); + + complete_all(&req->r_completion); } else { spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_item); @@ -875,14 +924,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (pos+len) | (PAGE_SIZE - 1)); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, false, false); - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; @@ -956,7 +1003,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req, false); if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } } return -EIOCBQUEUED; @@ -1067,9 +1114,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, true); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - + req->r_mtime = mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1292,7 +1337,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; } @@ -1350,7 +1395,6 @@ retry_snap: iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -1361,8 +1405,6 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (i_size_read(inode) > old_size) - ceph_fscache_update_objectsize(inode); inode_unlock(inode); } @@ -1383,7 +1425,7 @@ retry_snap: ceph_put_cap_refs(ci, got); if (written >= 0) { - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)) + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); @@ -1524,9 +1566,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1617,8 +1657,8 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; goto unlock; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e669cfa9d793..f059b5997072 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -11,6 +11,7 @@ #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/random.h> +#include <linux/sort.h> #include "super.h" #include "mds_client.h" @@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode, diri_auth = ci->i_auth_cap->mds; spin_unlock(&ci->i_ceph_lock); + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + mutex_lock(&ci->i_fragtree_mutex); if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ @@ -300,20 +304,38 @@ out: return err; } +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(ls->frag, rs->frag); +} + +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_inode_frag *frag; + struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; - int i; - u32 id, nsplits; + unsigned i, split_by, nsplits; + u32 id; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); - if (nsplits) { + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) @@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode, if (!update) goto out_unlock; + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } frag = NULL; while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); @@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode, break; } rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } frag = NULL; } if (!frag) { @@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode, if (IS_ERR(frag)) continue; } - frag->split_by = le32_to_cpu(fragtree->splits[i].by); + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; dout(" frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; } while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } } out_unlock: mutex_unlock(&ci->i_fragtree_mutex); @@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode) rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) @@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode) return 1; } +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, size = 0; } i_size_write(inode, size); - inode->i_blocks = (size + (1<<9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); - err = -EINVAL; - if (WARN_ON(symlen != i_size_read(inode))) - goto out; + if (symlen != i_size_read(inode)) { + pr_err("fill_inode %llx.%llx BAD symlink " + "size %lld\n", ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } err = -ENOMEM; sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); @@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; struct inode *in; int rc; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); in = ceph_get_inode(req->r_dentry->d_sb, vino); if (IS_ERR(in)) { @@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { pr_err("fill_inode badness on %p got %d\n", in, rc); err = rc; - continue; } + iput(in); } return err; @@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) + if (!rinfo->hash_order) req->r_readdir_offset = 2; - else - req->r_readdir_offset = 0; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; - dname.name = rinfo->dir_dname[i]; - dname.len = rinfo->dir_dname_len[i]; + dname.name = rde->name; + dname.len = rde->name_len; dname.hash = full_name_hash(dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); + + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } retry_lookup: dn = d_lookup(parent, &dname); @@ -1490,7 +1569,7 @@ retry_lookup: } } - ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + ret = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (ret < 0) { @@ -1523,11 +1602,9 @@ retry_lookup: dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + ceph_dentry(dn)->offset = rde->offset; - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, + update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { @@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); i_size_write(inode, size); - inode->i_blocks = (size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && @@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_pg_inv_work); struct inode *inode = &ci->vfs_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0; mutex_lock(&ci->i_truncate_mutex); + + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", + inode, ceph_ino(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); @@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_pagecache(inode, 0); + if (invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err("invalidate_pages %p fails\n", inode); + } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); - inode->i_blocks = - (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(attr->ia_size); inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158..be6b1657b1af 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; - down_read(&osdc->map_sem); + down_read(&osdc->lock); r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return -EIO; } dl.file_offset -= dl.object_offset; @@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ceph_ino(inode), dl.object_no); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); - ceph_oid_set_name(&oid, dl.object_name); + ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); @@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } - up_read(&osdc->map_sem); + up_read(&osdc->lock); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85b8517f17a0..2103b823bec0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end, ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + } if (num == 0) goto done; - BUG_ON(!info->dir_in); - info->dir_dname = (void *)(info->dir_in + num); - info->dir_dname_len = (void *)(info->dir_dname + num); - info->dir_dlease = (void *)(info->dir_dname_len + num); - if ((unsigned long)(info->dir_dlease + num) > - (unsigned long)info->dir_in + info->dir_buf_size) { + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err("dir contents are larger than expected\n"); WARN_ON(1); goto bad; @@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_nr = num; while (num) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); - info->dir_dname_len[i] = ceph_decode_32(p); - ceph_decode_need(p, end, info->dir_dname_len[i], bad); - info->dir_dname[i] = *p; - *p += info->dir_dname_len[i]; - dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], - info->dir_dname[i]); - info->dir_dlease[i] = *p; + rde->name_len = ceph_decode_32(p); + ceph_decode_need(p, end, rde->name_len, bad); + rde->name = *p; + *p += rde->name_len; + dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + rde->lease = *p; *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i], features); + err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } @@ -345,9 +348,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - if (!info->dir_in) + if (!info->dir_entries) return; - free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); + free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. */ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); @@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; void *p; const char* metadata[][2] = { {"hostname", utsname()->nodename}, {"kernel_version", utsname()->release}, - {"entity_id", opt->name ? opt->name : ""}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? : "/"}, {NULL, NULL} }; @@ -1149,9 +1125,11 @@ out: static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { + struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - int drop = 0; + bool drop = false; + bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; + + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + + if (ci->i_wrbuffer_ref > 0 && + ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + invalidate = true; while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); @@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = 1; + drop = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = 1; + drop = true; } spin_unlock(&mdsc->cap_dirty_lock); @@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_del(&cf->list); ceph_free_cap_flush(cf); } - while (drop--) + + wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); + if (drop) iput(inode); return 0; } @@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, */ static void remove_session_caps(struct ceph_mds_session *session) { + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, NULL); + iterate_session_caps(session, remove_session_caps_cb, fsc); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { - struct super_block *sb = session->s_mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; @@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } + wake_up_all(&ci->i_cap_wq); return 0; } @@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; - size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + - sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + size_t size = sizeof(struct ceph_mds_reply_dir_entry); int order, num_entries; spin_lock(&ci->i_ceph_lock); @@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, - order); - if (rinfo->dir_in) + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, + order); + if (rinfo->dir_entries) break; order--; } - if (!rinfo->dir_in) + if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; @@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("handle_reply on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); @@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ee69a537dba5..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in { u32 pool_ns_len; }; +struct ceph_mds_reply_dir_entry { + char *name; + u32 name_len; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; + loff_t offset; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; + struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 261531e55e9d..8c3591a7fbae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err = -EINVAL; - u16 version; + u8 mdsmap_v, mdsmap_cv; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) return ERR_PTR(-ENOMEM); - ceph_decode_16_safe(p, end, version, bad); - if (version > 3) { - pr_warn("got mdsmap version %d > 3, failing", version); - goto bad; + ceph_decode_need(p, end, 1 + 1, bad); + mdsmap_v = ceph_decode_8(p); + mdsmap_cv = ceph_decode_8(p); + if (mdsmap_v >= 4) { + u32 mdsmap_len; + ceph_decode_32_safe(p, end, mdsmap_len, bad); + if (end < *p + mdsmap_len) + goto bad; + end = *p + mdsmap_len; } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); @@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 namelen; s32 mds, inc, state; u64 state_seq; - u8 infoversion; + u8 info_v; + void *info_end = NULL; struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; - ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); - infoversion = ceph_decode_8(p); + info_v= ceph_decode_8(p); + if (info_v >= 4) { + u32 info_len; + u8 info_cv; + ceph_decode_need(p, end, 1 + sizeof(u32), bad); + info_cv = ceph_decode_8(p); + info_len = ceph_decode_32(p); + info_end = *p + info_len; + if (info_end > end) + goto bad; + } + + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); *p += sizeof(u64); namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; @@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; - if (infoversion >= 2) { + if (info_v >= 2) { ceph_decode_32_safe(p, end, num_export_targets, bad); pexport_targets = *p; *p += num_export_targets * sizeof(u32); @@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) num_export_targets = 0; } + if (info_end && *p != info_end) { + if (*p > info_end) + goto bad; + *p = info_end; + } + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), @@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ + *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f12d5e2955c2..91e02481ce06 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { + Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -143,6 +144,7 @@ enum { }; static match_table_t fsopt_tokens = { + {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ + case Opt_mds_namespace: + fsopt->mds_namespace = intval; + break; case Opt_wsize: fsopt->wsize = intval; break; @@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->server_path); kfree(args); } @@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } static int parse_mount_options(struct ceph_mount_options **pfsopt, struct ceph_options **popt, int flags, char *options, - const char *dev_name, - const char **path) + const char *dev_name) { struct ceph_mount_options *fsopt; const char *dev_name_end; @@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); + fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name". @@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } else { - /* path is empty */ dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ @@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, goto out; } dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); @@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) + seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, { struct ceph_fs_client *fsc; const u64 supported_features = - CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDS_INLINE_DATA; + CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; @@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); fsc->mount_options = fsopt; @@ -785,8 +799,7 @@ out: /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto fail; } - if (path[0] == 0) { + if (!fsc->mount_options->server_path) { root = fsc->sb->s_root; dget(root); } else { - dout("mount opening base mountpoint\n"); + const char *path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; @@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, #ifdef CONFIG_CEPH_FS_POSIX_ACL flags |= MS_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { res = ERR_PTR(err); goto out_final; @@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } } - res = ceph_real_mount(fsc, path); + res = ceph_real_mount(fsc); if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7b99eb756477..0168b49fb6ad 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,6 +62,7 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ + int mds_namespace; /* * everything above this point can be memcmp'd; everything below @@ -69,6 +70,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *server_path; /* default "/" */ }; struct ceph_fs_client { @@ -101,7 +103,6 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - struct workqueue_struct *revalidate_wq; #endif }; @@ -295,6 +296,7 @@ struct ceph_inode_info { u64 i_files, i_subdirs; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; @@ -357,8 +359,7 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; /* sequence, for delayed fscache validate */ - struct work_struct i_revalidate_work; + u32 i_fscache_gen; #endif struct inode vfs_inode; /* at end */ }; @@ -469,6 +470,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -537,11 +539,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -632,7 +629,6 @@ struct ceph_file_info { struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ - unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; @@ -927,6 +923,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; +extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, @@ -942,6 +939,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 0d66722c6a52..4870b29df224 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { size_t len = strlen(pool_name); @@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ret = -ERANGE; } } - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) ret = snprintf(val, size, "%s", pool_name); else ret = snprintf(val, size, "%lld", (unsigned long long)pool); - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; int err; if (size > 0) { @@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (err) goto out; } else if (!value) { - flags |= CEPH_XATTR_REMOVE; + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; } dout("setxattr value=%.*s\n", (int)size, value); /* do request */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, - USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { ceph_mdsc_put_request(req); @@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, goto out; } - req->r_pagelist = pagelist; - pagelist = NULL; + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_pagelist = pagelist; + pagelist = NULL; + } req->r_inode = inode; ihold(inode); @@ -1051,12 +1056,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - return __ceph_setxattr(d_inode(dentry), name, value, size, flags); + return __ceph_setxattr(inode, name, value, size, flags); } const struct xattr_handler ceph_other_xattr_handler = { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index c8b77aa24a1d..5e23f64c0804 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -39,8 +39,9 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; static int cifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int rc = -EOPNOTSUPP; unsigned int xid; @@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (value && pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - size, d_inode(dentry), + size, inode, full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(d_inode(dentry))->time = 0; + CIFS_I(inode)->time = 0; kfree(pacl); } #endif /* CONFIG_CIFS_ACL */ diff --git a/fs/coredump.c b/fs/coredump.c index 38a7ab87e10a..281b768000e6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -794,6 +794,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) return 0; file->f_pos = pos; cprm->written += n; + cprm->pos += n; nr -= n; } return 1; @@ -808,6 +809,7 @@ int dump_skip(struct coredump_params *cprm, size_t nr) if (dump_interrupted() || file->f_op->llseek(file, nr, SEEK_CUR) < 0) return 0; + cprm->pos += nr; return 1; } else { while (nr > PAGE_SIZE) { @@ -822,7 +824,7 @@ EXPORT_SYMBOL(dump_skip); int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->file->f_pos & (align - 1); + unsigned mod = cprm->pos & (align - 1); if (align & (align - 1)) return 0; return mod ? dump_skip(cprm, align - mod) : 1; @@ -32,14 +32,43 @@ #include <linux/pfn_t.h> #include <linux/sizes.h> -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +/* + * We use lowest available bit in exceptional entry for locking, other two + * bits to determine entry type. In total 3 special bits. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) +#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ + RADIX_TREE_EXCEPTIONAL_ENTRY)) + +/* We choose 4096 entries - same as per-zone page wait tables */ +#define DAX_WAIT_TABLE_BITS 12 +#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) + +wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; + +static int __init init_dax_wait_table(void) +{ + int i; + + for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) + init_waitqueue_head(wait_table + i); + return 0; +} +fs_initcall(init_dax_wait_table); + +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index) +{ + unsigned long hash = hash_long((unsigned long)mapping ^ index, + DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { @@ -87,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) return page; } -/* - * dax_clear_sectors() is called from within transaction context from XFS, - * and hence this means the stack from this point must follow GFP_NOFS - * semantics for all operations. - */ -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) -{ - struct blk_dax_ctl dax = { - .sector = _sector, - .size = _size, - }; - - might_sleep(); - do { - long count, sz; - - count = dax_map_atomic(bdev, &dax); - if (count < 0) - return count; - sz = min_t(long, count, SZ_128K); - clear_pmem(dax.addr, sz); - dax.size -= sz; - dax.sector += sz / 512; - dax_unmap_atomic(bdev, &dax); - cond_resched(); - } while (dax.size); - - wmb_pmem(); - return 0; -} -EXPORT_SYMBOL_GPL(dax_clear_sectors); - -/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ -static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, - loff_t pos, loff_t end) -{ - loff_t final = end - pos + first; /* The final byte of the buffer */ - - if (first > 0) - clear_pmem(addr, first); - if (final < size) - clear_pmem(addr + final, size - final); -} - static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); @@ -169,6 +154,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, struct blk_dax_ctl dax = { .addr = (void __pmem *) ERR_PTR(-EIO), }; + unsigned blkbits = inode->i_blkbits; + sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) + >> blkbits; if (rw == READ) end = min(end, i_size_read(inode)); @@ -176,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, while (pos < end) { size_t len; if (pos == max) { - unsigned blkbits = inode->i_blkbits; long page = pos >> PAGE_SHIFT; sector_t block = page << (PAGE_SHIFT - blkbits); unsigned first = pos - (block << blkbits); @@ -192,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; bdev = bh->b_bdev; + /* + * We allow uninitialized buffers for writes + * beyond EOF as those cannot race with faults + */ + WARN_ON_ONCE( + (buffer_new(bh) && block < file_blks) || + (rw == WRITE && buffer_unwritten(bh))); } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -211,11 +205,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, rc = map_len; break; } - if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(dax.addr, map_len, first, - pos, end); - need_wmb = true; - } dax.addr += first; size = map_len - first; } @@ -276,15 +265,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { - struct address_space *mapping = inode->i_mapping; + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_lock(inode); - retval = filemap_write_and_wait_range(mapping, pos, end - 1); - if (retval) { - inode_unlock(inode); - goto out; - } - } /* Protects against truncate */ if (!(flags & DIO_SKIP_DIO_COUNT)) @@ -305,12 +287,268 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); - out: return retval; } EXPORT_SYMBOL_GPL(dax_do_io); /* + * DAX radix tree locking + */ +struct exceptional_entry_key { + struct address_space *mapping; + unsigned long index; +}; + +struct wait_exceptional_entry_queue { + wait_queue_t wait; + struct exceptional_entry_key key; +}; + +static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, + int sync, void *keyp) +{ + struct exceptional_entry_key *key = keyp; + struct wait_exceptional_entry_queue *ewait = + container_of(wait, struct wait_exceptional_entry_queue, wait); + + if (key->mapping != ewait->key.mapping || + key->index != ewait->key.index) + return 0; + return autoremove_wake_function(wait, mode, sync, NULL); +} + +/* + * Check whether the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline int slot_locked(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + return entry & RADIX_DAX_ENTRY_LOCK; +} + +/* + * Mark the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline void *lock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry |= RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Mark the given slot is unlocked. The function must be called with + * mapping->tree_lock held + */ +static inline void *unlock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Lookup entry in radix tree, wait for it to become unlocked if it is + * exceptional entry and return it. The caller must call + * put_unlocked_mapping_entry() when he decided not to lock the entry or + * put_locked_mapping_entry() when he locked the entry and now wants to + * unlock it. + * + * The function must be called with mapping->tree_lock held. + */ +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) +{ + void *ret, **slot; + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + ewait.key.mapping = mapping; + ewait.key.index = index; + + for (;;) { + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + &slot); + if (!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot)) { + if (slotp) + *slotp = slot; + return ret; + } + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mapping->tree_lock); + schedule(); + finish_wait(wq, &ewait.wait); + spin_lock_irq(&mapping->tree_lock); + } +} + +/* + * Find radix tree entry at given index. If it points to a page, return with + * the page locked. If it points to the exceptional entry, return with the + * radix tree entry locked. If the radix tree doesn't contain given index, + * create empty exceptional entry for the index and return with it locked. + * + * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For + * persistent memory the benefit is doubtful. We can add that later if we can + * show it helps. + */ +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + +restart: + spin_lock_irq(&mapping->tree_lock); + ret = get_unlocked_mapping_entry(mapping, index, &slot); + /* No entry for given index? Make sure radix tree is big enough. */ + if (!ret) { + int err; + + spin_unlock_irq(&mapping->tree_lock); + err = radix_tree_preload( + mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + if (err) + return ERR_PTR(err); + ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK); + spin_lock_irq(&mapping->tree_lock); + err = radix_tree_insert(&mapping->page_tree, index, ret); + radix_tree_preload_end(); + if (err) { + spin_unlock_irq(&mapping->tree_lock); + /* Someone already created the entry? */ + if (err == -EEXIST) + goto restart; + return ERR_PTR(err); + } + /* Good, we have inserted empty locked entry into the tree. */ + mapping->nrexceptional++; + spin_unlock_irq(&mapping->tree_lock); + return ret; + } + /* Normal page in radix tree? */ + if (!radix_tree_exceptional_entry(ret)) { + struct page *page = ret; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + lock_page(page); + /* Page got truncated? Retry... */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto restart; + } + return page; + } + ret = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all) +{ + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) { + struct exceptional_entry_key key; + + key.mapping = mapping; + key.index = index; + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + } +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + + spin_lock_irq(&mapping->tree_lock); + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +/* + * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree + * entry to get unlocked before deleting it. + */ +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *entry; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + /* + * This gets called from truncate / punch_hole path. As such, the caller + * must hold locks protecting against concurrent modifications of the + * radix tree (usually fs-private i_mmap_sem for writing). Since the + * caller has seen exceptional entry for this index, we better find it + * at that index as well... + */ + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, true); + + return 1; +} + +/* * The user has performed a load from a hole in the file. Allocating * a new page in the file would cause excessive storage usage for * workloads with sparse files. We allocate a page cache page instead. @@ -318,24 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io); * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, struct page *page, - struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - unsigned long size; - struct inode *inode = mapping->host; - if (!page) - page = find_or_create_page(mapping, vmf->pgoff, - GFP_KERNEL | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - /* Recheck i_size under page lock to avoid truncate race */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) { - unlock_page(page); - put_page(page); - return VM_FAULT_SIGBUS; + struct page *page; + + /* Hole page already exists? Return it... */ + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } + /* This will replace locked radix tree entry with a hole page */ + page = find_or_create_page(mapping, vmf->pgoff, + vmf->gfp_mask | __GFP_ZERO); + if (!page) { + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + return VM_FAULT_OOM; + } vmf->page = page; return VM_FAULT_LOCKED; } @@ -359,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } -#define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) -static int dax_radix_entry(struct address_space *mapping, pgoff_t index, - sector_t sector, bool pmd_entry, bool dirty) +static void *dax_insert_mapping_entry(struct address_space *mapping, + struct vm_fault *vmf, + void *entry, sector_t sector) { struct radix_tree_root *page_tree = &mapping->page_tree; - pgoff_t pmd_index = DAX_PMD_INDEX(index); - int type, error = 0; - void *entry; + int error = 0; + bool hole_fill = false; + void *new_entry; + pgoff_t index = vmf->pgoff; - WARN_ON_ONCE(pmd_entry && !dirty); - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - spin_lock_irq(&mapping->tree_lock); - - entry = radix_tree_lookup(page_tree, pmd_index); - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { - index = pmd_index; - goto dirty; + /* Replacing hole page with block mapping? */ + if (!radix_tree_exceptional_entry(entry)) { + hole_fill = true; + /* + * Unmap the page now before we remove it from page cache below. + * The page is locked so it cannot be faulted in again. + */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); + error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); + if (error) + return ERR_PTR(error); } - entry = radix_tree_lookup(page_tree, index); - if (entry) { - type = RADIX_DAX_TYPE(entry); - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && - type != RADIX_DAX_PMD)) { - error = -EIO; + spin_lock_irq(&mapping->tree_lock); + new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | + RADIX_DAX_ENTRY_LOCK); + if (hole_fill) { + __delete_from_page_cache(entry, NULL); + /* Drop pagecache reference */ + put_page(entry); + error = radix_tree_insert(page_tree, index, new_entry); + if (error) { + new_entry = ERR_PTR(error); goto unlock; } + mapping->nrexceptional++; + } else { + void **slot; + void *ret; - if (!pmd_entry || type == RADIX_DAX_PMD) - goto dirty; - - /* - * We only insert dirty PMD entries into the radix tree. This - * means we don't need to worry about removing a dirty PTE - * entry and inserting a clean PMD entry, thus reducing the - * range we would flush with a follow-up fsync/msync call. - */ - radix_tree_delete(&mapping->page_tree, index); - mapping->nrexceptional--; - } - - if (sector == NO_SECTOR) { - /* - * This can happen during correct operation if our pfn_mkwrite - * fault raced against a hole punch operation. If this - * happens the pte that was hole punched will have been - * unmapped and the radix tree entry will have been removed by - * the time we are called, but the call will still happen. We - * will return all the way up to wp_pfn_shared(), where the - * pte_same() check will fail, eventually causing page fault - * to be retried by the CPU. - */ - goto unlock; + ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + WARN_ON_ONCE(ret != entry); + radix_tree_replace_slot(slot, new_entry); } - - error = radix_tree_insert(page_tree, index, - RADIX_DAX_ENTRY(sector, pmd_entry)); - if (error) - goto unlock; - - mapping->nrexceptional++; - dirty: - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); - return error; + if (hole_fill) { + radix_tree_preload_end(); + /* + * We don't need hole page anymore, it has been replaced with + * locked radix tree entry now. + */ + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(entry); + unlock_page(entry); + put_page(entry); + } + return new_entry; } static int dax_writeback_one(struct block_device *bdev, @@ -555,56 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, +static int dax_insert_mapping(struct address_space *mapping, + struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), + .sector = to_sector(bh, mapping->host), .size = bh->b_size, }; - pgoff_t size; - int error; - - i_mmap_lock_read(mapping); - - /* - * Check truncate didn't happen while we were allocating a block. - * If it did, this block may or may not be still allocated to the - * file. We can't tell the filesystem to free it because we can't - * take i_mutex here. In the worst case, the file still has blocks - * allocated past the end of the file. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - error = -EIO; - goto out; - } + void *ret; + void *entry = *entryp; - if (dax_map_atomic(bdev, &dax) < 0) { - error = PTR_ERR(dax.addr); - goto out; - } - - if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(dax.addr, PAGE_SIZE); - wmb_pmem(); - } + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, - vmf->flags & FAULT_FLAG_WRITE); - if (error) - goto out; - - error = vm_insert_mixed(vma, vaddr, dax.pfn); + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + if (IS_ERR(ret)) + return PTR_ERR(ret); + *entryp = ret; - out: - i_mmap_unlock_read(mapping); - - return error; + return vm_insert_mixed(vma, vaddr, dax.pfn); } /** @@ -612,24 +818,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks - * @complete_unwritten: The filesystem method used to convert unwritten blocks - * to written so the data written to them is exposed. This is required for - * required by write faults for filesystems that will return unwritten - * extent mappings from @get_block, but it is optional for reads as - * dax_insert_mapping() will always zero unwritten blocks. If the fs does - * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; + void *entry; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; @@ -638,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, int error; int major = 0; + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -647,49 +852,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; - repeat: - page = find_get_page(mapping, vmf->pgoff); - if (page) { - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - put_page(page); - return VM_FAULT_RETRY; - } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - /* - * We have a struct page covering a hole in the file - * from a read fault and we've raced with a truncate - */ - error = -EIO; - goto unlock_page; - } + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock_page; - - if (!buffer_mapped(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_page; - } else { - return dax_load_hole(mapping, page, vmf); - } - } + goto unlock_entry; if (vmf->cow_page) { struct page *new_page = vmf->cow_page; @@ -698,53 +871,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; - vmf->page = page; - if (!page) { - i_mmap_lock_read(mapping); - /* Check we didn't race with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); - error = -EIO; - goto out; - } + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } - return VM_FAULT_LOCKED; + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; } - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); - - if (page) { - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - delete_from_page_cache(page); - unlock_page(page); - put_page(page); - page = NULL; - } - - /* - * If we successfully insert the new mapping over an unwritten extent, - * we need to ensure we convert the unwritten extent. If there is an - * error inserting the mapping, the filesystem needs to leave it as - * unwritten to prevent exposure of the stale underlying data to - * userspace, but we still need to call the completion function so - * the private resources on the mapping buffer can be released. We - * indicate what the callback should do via the uptodate variable, same - * as for normal BH based IO completions. - */ - error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) { - if (complete_unwritten) - complete_unwritten(&bh, !error); - else - WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + if (!buffer_mapped(&bh)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_entry; + } else { + return dax_load_hole(mapping, entry, vmf); + } } + /* Filesystem should not return unwritten buffers to us! */ + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); + error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -752,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - - unlock_page: - if (page) { - unlock_page(page); - put_page(page); - } - goto out; } EXPORT_SYMBOL(__dax_fault); @@ -772,7 +920,7 @@ EXPORT_SYMBOL(__dax_fault); * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -781,7 +929,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_fault(vma, vmf, get_block, complete_unwritten); + result = __dax_fault(vma, vmf, get_block); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -789,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * The 'colour' (ie low bits) within a PMD of a page offset. This comes up * more often than one might expect in the below function. @@ -815,8 +963,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -828,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int error, result = 0; + int result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ @@ -875,6 +1022,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); } bdev = bh.b_bdev; @@ -900,26 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, truncate_pagecache_range(inode, lstart, lend); } - i_mmap_lock_read(mapping); - - /* - * If a truncate happened while we were allocating blocks, we may - * leave blocks allocated to the file that are beyond EOF. We can't - * take i_mutex here, so just leave them hanging; they'll be freed - * when the file is deleted. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) { - result = VM_FAULT_SIGBUS; - goto out; - } - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, - "offset + huge page size > file size"); - goto fallback; - } - - if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); @@ -954,8 +1083,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, long length = dax_map_atomic(bdev, &dax); if (length < 0) { - result = VM_FAULT_SIGBUS; - goto out; + dax_pmd_dbg(&bh, address, "dax-error fallback"); + goto fallback; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); @@ -973,14 +1102,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } - - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - clear_pmem(dax.addr, PMD_SIZE); - wmb_pmem(); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } dax_unmap_atomic(bdev, &dax); /* @@ -999,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * the write to insert a dirty entry. */ if (write) { - error = dax_radix_entry(mapping, pgoff, dax.sector, - true, true); - if (error) { - dax_pmd_dbg(&bh, address, - "PMD radix insertion failed"); - goto fallback; - } + /* + * We should insert radix-tree entry and dirty it here. + * For now this is broken... + */ } dev_dbg(part_to_dev(bdev->bd_part), @@ -1018,11 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } out: - i_mmap_unlock_read(mapping); - - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - return result; fallback: @@ -1042,8 +1155,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault); * pmd_fault handler for DAX files. */ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -1052,8 +1164,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_pmd_fault(vma, address, pmd, flags, get_block, - complete_unwritten); + result = __dax_pmd_fault(vma, address, pmd, flags, get_block); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -1070,27 +1181,59 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - int error; - - /* - * We pass NO_SECTOR to dax_radix_entry() because we expect that a - * RADIX_DAX_PTE entry already exists in the radix tree from a - * previous call to __dax_fault(). We just want to look up that PTE - * entry using vmf->pgoff and make sure the dirty tag is set. This - * saves us from having to make a call to get_block() here to look - * up the sector. - */ - error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, - true); + struct address_space *mapping = file->f_mapping; + void *entry; + pgoff_t index = vmf->pgoff; - if (error == -ENOMEM) - return VM_FAULT_OOM; - if (error) - return VM_FAULT_SIGBUS; + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + put_unlocked_mapping_entry(mapping, index, entry); +out: + spin_unlock_irq(&mapping->tree_lock); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +static bool dax_range_is_aligned(struct block_device *bdev, + unsigned int offset, unsigned int length) +{ + unsigned short sector_size = bdev_logical_block_size(bdev); + + if (!IS_ALIGNED(offset, sector_size)) + return false; + if (!IS_ALIGNED(length, sector_size)) + return false; + + return true; +} + +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length) +{ + struct blk_dax_ctl dax = { + .sector = sector, + .size = PAGE_SIZE, + }; + + if (dax_range_is_aligned(bdev, offset, length)) { + sector_t start_sector = dax.sector + (offset >> 9); + + return blkdev_issue_zeroout(bdev, start_sector, + length >> 9, GFP_NOFS, true); + } else { + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); + wmb_pmem(); + dax_unmap_atomic(bdev, &dax); + } + return 0; +} +EXPORT_SYMBOL_GPL(__dax_zero_page_range); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated @@ -1102,12 +1245,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); * page in a DAX file. This is intended for hole-punch operations. If * you are truncating a file, the helper function dax_truncate_page() may be * more convenient. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, get_block_t get_block) @@ -1126,23 +1263,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; err = get_block(inode, index, &bh, 0); - if (err < 0) + if (err < 0 || !buffer_written(&bh)) return err; - if (buffer_written(&bh)) { - struct block_device *bdev = bh.b_bdev; - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PAGE_SIZE, - }; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - wmb_pmem(); - dax_unmap_atomic(bdev, &dax); - } - - return 0; + return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), + offset, length); } EXPORT_SYMBOL_GPL(dax_zero_page_range); @@ -1154,12 +1279,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range); * * Similar to block_truncate_page(), this function can be called by a * filesystem when it is truncating a DAX file to handle the partial page. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) { diff --git a/fs/dcache.c b/fs/dcache.c index c622872c12c5..817c243c1ff1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1636,7 +1636,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; - + dentry->d_flags |= DCACHE_RCUACCESS; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1670,8 +1670,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) struct qstr q; q.name = name; - q.len = strlen(name); - q.hash = full_name_hash(q.name, q.len); + q.hash_len = hashlen_string(name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); @@ -2359,7 +2358,6 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) { BUG_ON(!d_unhashed(entry)); hlist_bl_lock(b); - entry->d_flags |= DCACHE_RCUACCESS; hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } @@ -2844,6 +2842,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { /* splicing a tree */ + dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_parent = target->d_parent; target->d_parent = target; list_del_init(&target->d_child); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0b2954d7172d..37c134a132c7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = { static DEFINE_MUTEX(allocated_ptys_lock); -static struct vfsmount *devpts_mnt; - struct pts_mount_opts { int setuid; int setgid; @@ -104,7 +102,7 @@ struct pts_mount_opts { kgid_t gid; umode_t mode; umode_t ptmxmode; - int newinstance; + int reserve; int max; }; @@ -117,11 +115,9 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES {Opt_ptmxmode, "ptmxmode=%o"}, {Opt_newinstance, "newinstance"}, {Opt_max, "max=%d"}, -#endif {Opt_err, NULL} }; @@ -137,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -static inline struct super_block *pts_sb_from_inode(struct inode *inode) +struct pts_fs_info *devpts_acquire(struct file *filp) { -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES - if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return inode->i_sb; -#endif - if (!devpts_mnt) - return NULL; - return devpts_mnt->mnt_sb; + struct pts_fs_info *result; + struct path path; + struct super_block *sb; + int err; + + path = filp->f_path; + path_get(&path); + + /* Has the devpts filesystem already been found? */ + sb = path.mnt->mnt_sb; + if (sb->s_magic != DEVPTS_SUPER_MAGIC) { + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(&path); + if (err) { + result = ERR_PTR(err); + goto out; + } + + /* Is the path the root of a devpts filesystem? */ + result = ERR_PTR(-ENODEV); + sb = path.mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path.mnt->mnt_root != sb->s_root)) + goto out; + } + + /* + * pty code needs to hold extra references in case of last /dev/tty close + */ + atomic_inc(&sb->s_active); + result = DEVPTS_SB(sb); + +out: + path_put(&path); + return result; +} + +void devpts_release(struct pts_fs_info *fsi) +{ + deactivate_super(fsi->sb); } #define PARSE_MOUNT 0 @@ -154,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) /* * parse_mount_options(): * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. The exception is - * 'newinstance' option which can only be set/cleared on a mount (i.e. - * cannot be changed during remount). + * specified in @data, set it to its default value. * * Note: @data may be NULL (in which case all options are set to default). */ @@ -174,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; - /* newinstance makes sense only on initial mount */ + /* Only allow instances mounted from the initial mount + * namespace to tap the reserve pool of ptys. + */ if (op == PARSE_MOUNT) - opts->newinstance = 0; + opts->reserve = + (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); while ((p = strsep(&data, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -211,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->mode = option & S_IALLUGO; break; -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES case Opt_ptmxmode: if (match_octal(&args[0], &option)) return -EINVAL; opts->ptmxmode = option & S_IALLUGO; break; case Opt_newinstance: - /* newinstance makes sense only on initial mount */ - if (op == PARSE_MOUNT) - opts->newinstance = 1; break; case Opt_max: if (match_int(&args[0], &option) || @@ -228,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->max = option; break; -#endif default: pr_err("called with bogus options\n"); return -EINVAL; @@ -238,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return 0; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES static int mknod_ptmx(struct super_block *sb) { int mode; @@ -305,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } -#else -static inline void update_ptmx_mode(struct pts_fs_info *fsi) -{ - return; -} -#endif static int devpts_remount(struct super_block *sb, int *flags, char *data) { @@ -344,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); if (opts->max < NR_UNIX98_PTY_MAX) seq_printf(seq, ",max=%d", opts->max); -#endif return 0; } @@ -410,40 +426,11 @@ fail: return -ENOMEM; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES -static int compare_init_pts_sb(struct super_block *s, void *p) -{ - if (devpts_mnt) - return devpts_mnt->mnt_sb == s; - return 0; -} - /* * devpts_mount() * - * If the '-o newinstance' mount option was specified, mount a new - * (private) instance of devpts. PTYs created in this instance are - * independent of the PTYs in other devpts instances. - * - * If the '-o newinstance' option was not specified, mount/remount the - * initial kernel mount of devpts. This type of mount gives the - * legacy, single-instance semantics. - * - * The 'newinstance' option is needed to support multiple namespace - * semantics in devpts while preserving backward compatibility of the - * current 'single-namespace' semantics. i.e all mounts of devpts - * without the 'newinstance' mount option should bind to the initial - * kernel mount, like mount_single(). - * - * Mounts with 'newinstance' option create a new, private namespace. - * - * NOTE: - * - * For single-mount semantics, devpts cannot use mount_single(), - * because mount_single()/sget() find and use the super-block from - * the most recent mount of devpts. But that recent mount may be a - * 'newinstance' mount and mount_single() would pick the newinstance - * super-block instead of the initial super-block. + * Mount a new (private) instance of devpts. PTYs created in this + * instance are independent of the PTYs in other devpts instances. */ static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -456,18 +443,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, if (error) return ERR_PTR(error); - /* Require newinstance for all user namespace mounts to ensure - * the mount options are not changed. - */ - if ((current_user_ns() != &init_user_ns) && !opts.newinstance) - return ERR_PTR(-EINVAL); - - if (opts.newinstance) - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - else - s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, - NULL); - + s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -491,18 +467,6 @@ out_undo_sget: return ERR_PTR(error); } -#else -/* - * This supports only the legacy single-instance semantics (no - * multiple-instance semantics) - */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_single(fs_type, flags, data, devpts_fill_super); -} -#endif - static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); @@ -516,9 +480,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, -#endif }; /* @@ -531,16 +493,13 @@ int devpts_new_index(struct pts_fs_info *fsi) int index; int ida_ret; - if (!fsi) - return -ENODEV; - retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; mutex_lock(&allocated_ptys_lock); - if (pty_count >= pty_limit - - (fsi->mount_opts.newinstance ? pty_reserve : 0)) { + if (pty_count >= (pty_limit - + (fsi->mount_opts.reserve ? 0 : pty_reserve))) { mutex_unlock(&allocated_ptys_lock); return -ENOSPC; } @@ -571,30 +530,6 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) mutex_unlock(&allocated_ptys_lock); } -/* - * pty code needs to hold extra references in case of last /dev/tty close - */ -struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) -{ - struct super_block *sb; - struct pts_fs_info *fsi; - - sb = pts_sb_from_inode(ptmx_inode); - if (!sb) - return NULL; - fsi = DEVPTS_SB(sb); - if (!fsi) - return NULL; - - atomic_inc(&sb->s_active); - return fsi; -} - -void devpts_put_ref(struct pts_fs_info *fsi) -{ - deactivate_super(fsi->sb); -} - /** * devpts_pty_new -- create a new inode in /dev/pts/ * @ptmx_inode: inode of the master @@ -607,16 +542,12 @@ void devpts_put_ref(struct pts_fs_info *fsi) struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; - struct super_block *sb; + struct super_block *sb = fsi->sb; struct inode *inode; struct dentry *root; struct pts_mount_opts *opts; char s[12]; - if (!fsi) - return ERR_PTR(-ENODEV); - - sb = fsi->sb; root = sb->s_root; opts = &fsi->mount_opts; @@ -676,20 +607,8 @@ void devpts_pty_kill(struct dentry *dentry) static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); - struct ctl_table_header *table; - if (!err) { - struct vfsmount *mnt; - - table = register_sysctl_table(pty_root_table); - mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - unregister_filesystem(&devpts_fs_type); - unregister_sysctl_table(table); - } else { - devpts_mnt = mnt; - } + register_sysctl_table(pty_root_table); } return err; } diff --git a/fs/direct-io.c b/fs/direct-io.c index 3bf3f20f8ecc..f3b4408be590 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -628,11 +628,11 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, map_bh->b_size = fs_count << i_blkbits; /* - * For writes inside i_size on a DIO_SKIP_HOLES filesystem we - * forbid block creations: only overwrites are permitted. - * We will return early to the caller once we see an - * unmapped buffer head returned, and the caller will fall - * back to buffered I/O. + * For writes that could fill holes inside i_size on a + * DIO_SKIP_HOLES filesystem we forbid block creations: only + * overwrites are permitted. We will return early to the caller + * once we see an unmapped buffer head returned, and the caller + * will fall back to buffered I/O. * * Otherwise the decision is left to the get_blocks method, * which may decide to handle it or also return an unmapped @@ -640,8 +640,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, */ create = dio->rw & WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (sdio->block_in_file < (i_size_read(dio->inode) >> - sdio->blkbits)) + if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> + i_blkbits)) create = 0; } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index ebd40f46ed4c..0d8eb3455b34 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode, char *page_virt, size_t size) { int rc; - rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, - size, 0); + rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode, + ECRYPTFS_XATTR_NAME, page_virt, size, 0); return rc; } @@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, - size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode, + virt, size); else rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 3ec495db7e82..4ba1547bb9ad 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -609,8 +609,8 @@ ssize_t ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, const char *name, void *value, size_t size); int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); #ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 318b04689d76..9d153b6a1d72 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1001,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, } int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { int rc = 0; @@ -1014,8 +1015,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc && d_really_is_positive(dentry)) - fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); + if (!rc && inode) + fsstack_copy_attr_all(inode, d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 866bb18efefe..e818f5ac7a26 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/mount.h> +#include <linux/file.h> #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -147,7 +148,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto out; + goto have_file; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -165,8 +166,16 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) + if (IS_ERR(*lower_file)) { rc = PTR_ERR(*lower_file); + goto out; + } +have_file: + if ((*lower_file)->f_op->mmap == NULL) { + fput(*lower_file); + *lower_file = NULL; + rc = -EMEDIUMTYPE; + } out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 148d11b514fb..9c3437c8a5b1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -442,7 +442,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c1400b109805..868c02317b05 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = __dax_fault(vma, vmf, ext2_get_block, NULL); + ret = __dax_fault(vma, vmf, ext2_get_block); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, } down_read(&ei->dax_sem); - ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b675610391b8..fcbe58641e40 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,6 +26,7 @@ #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> +#include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/writeback.h> #include <linux/buffer_head.h> @@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_sectors(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) << - (inode->i_blkbits - 9), - 1 << inode->i_blkbits); + err = sb_issue_zeroout(inode->i_sb, + le32_to_cpu(chain[depth-1].key), count, + GFP_NOFS); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } + } else + set_buffer_new(bh_result); ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); - set_buffer_new(bh_result); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b78caf25f746..1d9379568aa8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext2_msg(sb, KERN_ERR, - "error: device does not support dax"); - goto failed_mount; - } } /* If the blocksize doesn't match, re-read the thing.. */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 7fd3b867ce65..7b9e9c1842d5 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 0f85705ff519..65049b71af13 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 1fafd27037cc..fb2f992ae763 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d478110c32a6..df44c877892a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); + result = __dax_fault(vma, vmf, ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block, NULL); + ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20c5d52253b4..3822a5aedc61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3417,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); - goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext4_msg(sb, KERN_ERR, - "error: device does not support dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 123a7d010efe..a8921112030d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 60652fa24cbc..c7765c735714 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 17a446ffecd3..ca20e423034b 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 00ea56797258..e3decae3acfb 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); switch (handler->flags) { case F2FS_XATTR_INDEX_USER: @@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, default: return -EINVAL; } - return f2fs_setxattr(d_inode(dentry), handler->flags, name, + return f2fs_setxattr(inode, handler->flags, name, value, size, NULL, flags); } @@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3078b679fcd1..c8c4f79c7ce1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -887,6 +887,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) put_page(results[i]); } + wake_up_bit(&cookie->flags, 0); + _leave(""); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b9419058108f..ccd4971cc6c1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1719,10 +1719,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, return fuse_update_attributes(inode, stat, NULL, NULL); } -static int fuse_setxattr(struct dentry *entry, const char *name, - const void *value, size_t size, int flags) +static int fuse_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 4a01f30e9995..271d93905bac 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { __be64 *hash; + int error; hash = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hash)) - return PTR_ERR(hash); - *leaf_out = be64_to_cpu(*(hash + index)); - return 0; + error = PTR_ERR_OR_ZERO(hash); + + if (!error) + *leaf_out = be64_to_cpu(*(hash + index)); + + return error; } static int get_first_leaf(struct gfs2_inode *dip, u32 index, @@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, int error; error = get_leaf_nr(dip, index, &leaf_no); - if (!IS_ERR_VALUE(error)) + if (!error) error = get_leaf(dip, leaf_no, bh_out); return error; @@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); - if (IS_ERR_VALUE(error)) + if (error) return error; /* Get the old leaf block */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f42ab53bd30d..3a2853504084 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 064f92f17efc..d9a86919fdf6 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -13,10 +13,10 @@ #include "hfs_fs.h" #include "btree.h" -int hfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int hfs_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index fa3eed86837c..ee2f385811c8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -212,7 +212,7 @@ extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ -extern int hfs_setxattr(struct dentry *dentry, const char *name, +extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 4f118d282a7a..d37bb88dc746 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen) { @@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, - flags); + res = __hfsplus_setxattr(inode, xattr_name, value, size, flags); kfree(xattr_name); return res; } @@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { /* * Don't allow setting properly prefixed attributes @@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); + return __hfsplus_setxattr(inode, name, buffer, size, flags); } const struct xattr_handler hfsplus_xattr_osx_handler = { diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index d04ba6f58df2..68f6b539371f 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index ae2ca8c2e335..37b3efa733ef 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index eae2947060aa..94519d6c627d 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 3c9eec3e4c7b..fae6c0ea0030 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 458cf463047b..82067ca22f2b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/bitmap.h> #include <linux/slab.h> +#include <linux/seq_file.h> /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ @@ -453,10 +454,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) int lowercase, eas, chk, errs, chkdsk, timeshift; int o; struct hpfs_sb_info *sbi = hpfs_sb(s); - char *new_opts = kstrdup(data, GFP_KERNEL); - - if (!new_opts) - return -ENOMEM; sync_filesystem(s); @@ -493,17 +490,44 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); - replace_mount_options(s, new_opts); - hpfs_unlock(s); return 0; out_err: hpfs_unlock(s); - kfree(new_opts); return -EINVAL; } +static int hpfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb); + + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid)); + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid)); + seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777)); + if (sbi->sb_lowercase) + seq_printf(seq, ",case=lower"); + if (!sbi->sb_chk) + seq_printf(seq, ",check=none"); + if (sbi->sb_chk == 2) + seq_printf(seq, ",check=strict"); + if (!sbi->sb_err) + seq_printf(seq, ",errors=continue"); + if (sbi->sb_err == 2) + seq_printf(seq, ",errors=panic"); + if (!sbi->sb_chkdsk) + seq_printf(seq, ",chkdsk=no"); + if (sbi->sb_chkdsk == 2) + seq_printf(seq, ",chkdsk=always"); + if (!sbi->sb_eas) + seq_printf(seq, ",eas=no"); + if (sbi->sb_eas == 1) + seq_printf(seq, ",eas=ro"); + if (sbi->sb_timeshift) + seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift); + return 0; +} + /* Super operations */ static const struct super_operations hpfs_sops = @@ -514,7 +538,7 @@ static const struct super_operations hpfs_sops = .put_super = hpfs_put_super, .statfs = hpfs_statfs, .remount_fs = hpfs_remount_fs, - .show_options = generic_show_options, + .show_options = hpfs_show_options, }; static int hpfs_fill_super(struct super_block *s, void *options, int silent) @@ -537,8 +561,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) int o; - save_mount_options(s, options); - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 3ed9a4b49778..c2332e30f218 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 4ebecff1d922..5d6030826c52 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index bce249e1b277..9d027b4abcf9 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index beb182b503b3..0bf3c33aedff 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __jfs_xattr_set(inode, name, value, size, flags); } @@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (is_known_namespace(name)) return -EOPNOTSUPP; return __jfs_xattr_set(inode, name, value, size, flags); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 1719649d7ad7..63b925d5ba1e 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -160,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, return 0; } -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; void *secdata; int error; @@ -175,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(d_inode(dentry), suffix, + error = security_inode_setsecurity(inode, suffix, value, size, flags); if (error) return error; - error = security_inode_getsecctx(d_inode(dentry), + error = security_inode_getsecctx(inode, &secdata, &secdata_len); if (error) return error; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 45c9192c276e..37159235ac10 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -81,7 +81,8 @@ int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, +int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags); int kernfs_iop_removexattr(struct dentry *dentry, const char *name); ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode, diff --git a/fs/libfs.c b/fs/libfs.c index 8765ff1adc07..3db2721144c2 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1118,8 +1118,9 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) return -EPERM; } -static int empty_dir_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { return -EOPNOTSUPP; } diff --git a/fs/namei.c b/fs/namei.c index 5375571cf6e1..70580ab1445c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -35,6 +35,7 @@ #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> +#include <linux/bitops.h> #include <asm/uaccess.h> #include "internal.h" @@ -1415,21 +1416,28 @@ static void follow_mount(struct path *path) } } +static int path_parent_directory(struct path *path) +{ + struct dentry *old = path->dentry; + /* rare case of legitimate dget_parent()... */ + path->dentry = dget_parent(path->dentry); + dput(old); + if (unlikely(!path_connected(path))) + return -ENOENT; + return 0; +} + static int follow_dotdot(struct nameidata *nd) { while(1) { - struct dentry *old = nd->path.dentry; - if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - nd->path.dentry = dget_parent(nd->path.dentry); - dput(old); - if (unlikely(!path_connected(&nd->path))) - return -ENOENT; + int ret = path_parent_directory(&nd->path); + if (ret) + return ret; break; } if (!follow_up(&nd->path)) @@ -1797,74 +1805,144 @@ static int walk_component(struct nameidata *nd, int flags) #include <asm/word-at-a-time.h> -#ifdef CONFIG_64BIT +#ifdef HASH_MIX -static inline unsigned int fold_hash(unsigned long hash) -{ - return hash_64(hash, 32); -} +/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ + +#elif defined(CONFIG_64BIT) +/* + * Register pressure in the mixing function is an issue, particularly + * on 32-bit x86, but almost any function requires one state value and + * one temporary. Instead, use a function designed for two state values + * and no temporaries. + * + * This function cannot create a collision in only two iterations, so + * we have two iterations to achieve avalanche. In those two iterations, + * we have six layers of mixing, which is enough to spread one bit's + * influence out to 2^6 = 64 state bits. + * + * Rotate constants are scored by considering either 64 one-bit input + * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the + * probability of that delta causing a change to each of the 128 output + * bits, using a sample of random initial states. + * + * The Shannon entropy of the computed probabilities is then summed + * to produce a score. Ideally, any input change has a 50% chance of + * toggling any given output bit. + * + * Mixing scores (in bits) for (12,45): + * Input delta: 1-bit 2-bit + * 1 round: 713.3 42542.6 + * 2 rounds: 2753.7 140389.8 + * 3 rounds: 5954.1 233458.2 + * 4 rounds: 7862.6 256672.2 + * Perfect: 8192 258048 + * (64*128) (64*63/2 * 128) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol64(x,12),\ + x += y, y = rol64(y,45),\ + y *= 9 ) /* - * This is George Marsaglia's XORSHIFT generator. - * It implements a maximum-period LFSR in only a few - * instructions. It also has the property (required - * by hash_name()) that mix_hash(0) = 0. + * Fold two longs into one 32-bit hash value. This must be fast, but + * latency isn't quite as critical, as there is a fair bit of additional + * work done before the hash value is used. */ -static inline unsigned long mix_hash(unsigned long hash) +static inline unsigned int fold_hash(unsigned long x, unsigned long y) { - hash ^= hash << 13; - hash ^= hash >> 7; - hash ^= hash << 17; - return hash; + y ^= x * GOLDEN_RATIO_64; + y *= GOLDEN_RATIO_64; + return y >> 32; } #else /* 32-bit case */ -#define fold_hash(x) (x) +/* + * Mixing scores (in bits) for (7,20): + * Input delta: 1-bit 2-bit + * 1 round: 330.3 9201.6 + * 2 rounds: 1246.4 25475.4 + * 3 rounds: 1907.1 31295.1 + * 4 rounds: 2042.3 31718.6 + * Perfect: 2048 31744 + * (32*64) (32*31/2 * 64) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol32(x, 7),\ + x += y, y = rol32(y,20),\ + y *= 9 ) -static inline unsigned long mix_hash(unsigned long hash) +static inline unsigned int fold_hash(unsigned long x, unsigned long y) { - hash ^= hash << 13; - hash ^= hash >> 17; - hash ^= hash << 5; - return hash; + /* Use arch-optimized multiply if one exists */ + return __hash_32(y ^ __hash_32(x)); } #endif -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* + * Return the hash of a string of known length. This is carfully + * designed to match hash_name(), which is the more critical function. + * In particular, we must end by hashing a final word containing 0..7 + * payload bytes, to match the way that hash_name() iterates until it + * finds the delimiter after the name. + */ +unsigned int full_name_hash(const char *name, unsigned int len) { - unsigned long a, hash = 0; + unsigned long a, x = 0, y = 0; for (;;) { + if (!len) + goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash = mix_hash(hash + a); + HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); - if (!len) - goto done; } - hash += a & bytemask_from_count(len); + x ^= a & bytemask_from_count(len); done: - return fold_hash(hash); + return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const char *name) +{ + unsigned long a = 0, x = 0, y = 0, adata, mask, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + + len = -sizeof(unsigned long); + do { + HASH_MIX(x, y, a); + len += sizeof(unsigned long); + a = load_unaligned_zeropad(name+len); + } while (!has_zero(a, &adata, &constants)); + + adata = prep_zero_mask(a, adata, &constants); + mask = create_zero_mask(adata); + x ^= a & zero_bytemask(mask); + + return hashlen_create(fold_hash(x, y), len + find_zero(mask)); +} +EXPORT_SYMBOL(hashlen_string); + /* * Calculate the length and hash of the path component, and * return the "hash_len" as the result. */ static inline u64 hash_name(const char *name) { - unsigned long a, b, adata, bdata, mask, hash, len; + unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - hash = a = 0; len = -sizeof(unsigned long); do { - hash = mix_hash(hash + a); + HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); @@ -1872,25 +1950,40 @@ static inline u64 hash_name(const char *name) adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); - mask = create_zero_mask(adata | bdata); + x ^= a & zero_bytemask(mask); - hash += a & zero_bytemask(mask); - len += find_zero(mask); - return hashlen_create(fold_hash(hash), len); + return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } -#else +#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* Return the hash of a string of known length */ +unsigned int full_name_hash(const char *name, unsigned int len) { unsigned long hash = init_name_hash(); while (len--) - hash = partial_name_hash(*name++, hash); + hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const char *name) +{ + unsigned long hash = init_name_hash(); + unsigned long len = 0, c; + + c = (unsigned char)*name; + while (c) { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(hashlen_string); + /* * We know there's a real path component here of at least * one character. @@ -1934,7 +2027,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) int type; err = may_lookup(nd); - if (err) + if (err) return err; hash_len = hash_name(name); @@ -2428,6 +2521,34 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +#ifdef CONFIG_UNIX98_PTYS +int path_pts(struct path *path) +{ + /* Find something mounted on "pts" in the same directory as + * the input path. + */ + struct dentry *child, *parent; + struct qstr this; + int ret; + + ret = path_parent_directory(path); + if (ret) + return ret; + + parent = path->dentry; + this.name = "pts"; + this.len = 3; + child = d_hash_and_lookup(parent, &this); + if (!child) + return -ENOENT; + + path->dentry = child; + dput(parent); + follow_mount(path); + return 0; +} +#endif + int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { @@ -2909,9 +3030,13 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, } if (*opened & FILE_CREATED) fsnotify_create(dir, dentry); - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 1; + if (unlikely(d_is_negative(dentry))) { + error = -ENOENT; + } else { + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; + } } } dput(dentry); @@ -3080,9 +3205,7 @@ static int do_last(struct nameidata *nd, int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; - struct path save_parent = { .dentry = NULL, .mnt = NULL }; struct path path; - bool retried = false; int error; nd->flags &= ~LOOKUP_PARENT; @@ -3125,7 +3248,6 @@ static int do_last(struct nameidata *nd, return -EISDIR; } -retry_lookup: if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) @@ -3177,6 +3299,10 @@ retry_lookup: got_write = false; } + error = follow_managed(&path, nd); + if (unlikely(error < 0)) + return error; + if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; @@ -3192,10 +3318,6 @@ retry_lookup: return -EEXIST; } - error = follow_managed(&path, nd); - if (unlikely(error < 0)) - return error; - seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: @@ -3206,23 +3328,14 @@ finish_lookup: if (unlikely(error)) return error; - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { - path_to_nameidata(&path, nd); - } else { - save_parent.dentry = nd->path.dentry; - save_parent.mnt = mntget(path.mnt); - nd->path.dentry = path.dentry; - - } + path_to_nameidata(&path, nd); nd->inode = inode; nd->seq = seq; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ finish_open: error = complete_walk(nd); - if (error) { - path_put(&save_parent); + if (error) return error; - } audit_inode(nd->name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) @@ -3245,13 +3358,9 @@ finish_open_created: goto out; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); - if (!error) { - *opened |= FILE_OPENED; - } else { - if (error == -EOPENSTALE) - goto stale_open; + if (error) goto out; - } + *opened |= FILE_OPENED; opened: error = open_check_o_direct(file); if (!error) @@ -3267,26 +3376,7 @@ out: } if (got_write) mnt_drop_write(nd->path.mnt); - path_put(&save_parent); return error; - -stale_open: - /* If no saved parent or already retried then can't retry */ - if (!save_parent.dentry || retried) - goto out; - - BUG_ON(save_parent.dentry != dir); - path_put(&nd->path); - nd->path = save_parent; - nd->inode = dir->d_inode; - save_parent.mnt = NULL; - save_parent.dentry = NULL; - if (got_write) { - mnt_drop_write(nd->path.mnt); - got_write = false; - } - retried = true; - goto retry_lookup; } static int do_tmpfile(struct nameidata *nd, unsigned flags, @@ -4542,7 +4632,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link) out: return len; } -EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that diff --git a/fs/namespace.c b/fs/namespace.c index 4fb1691b4355..a7ec92c051f5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2409,8 +2409,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } if (type->fs_flags & FS_USERNS_VISIBLE) { - if (!fs_fully_visible(type, &mnt_flags)) + if (!fs_fully_visible(type, &mnt_flags)) { + put_filesystem(type); return -EPERM; + } } } @@ -3271,7 +3273,7 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ - if (!(mnt_flags & MNT_LOCKED)) + if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanetly empty? */ if (!is_empty_dir_inode(inode)) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 618ced381a14..aaa2e8d3df6f 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -217,7 +217,8 @@ static u32 initiate_file_draining(struct nfs_client *clp, } if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, - &args->cbl_range)) { + &args->cbl_range, + be32_to_cpu(args->cbl_stateid.seqid))) { rv = NFS4_OK; goto unlock; } @@ -500,8 +501,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, cps->slot = slot; /* The ca_maxresponsesize_cached is 0 with no DRC */ - if (args->csa_cachethis != 0) - return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + if (args->csa_cachethis != 0) { + status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + goto out_unlock; + } /* * Check for pending referring calls. If a match is found, a diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 976c90608e56..d81f96aacd51 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) p = read_buf(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid, p, NFS4_STATEID_SIZE); + memcpy(stateid->data, p, NFS4_STATEID_SIZE); return 0; } +static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) { __be32 *p; @@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 *p; __be32 status; - status = decode_stateid(xdr, &args->stateid); + status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; p = read_buf(xdr, 4); @@ -227,6 +233,11 @@ out: } #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, @@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, } p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_stateid(xdr, &args->cbl_stateid); + status = decode_layout_stateid(xdr, &args->cbl_stateid); if (unlikely(status != 0)) goto out; } else if (args->cbl_recall_type == RETURN_FSID) { diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5166adcfc0fb..322c2585bc34 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp) /** * nfs4_copy_delegation_stateid - Copy inode's state ID information - * @dst: stateid data structure to fill in * @inode: inode to check * @flags: delegation type requirement + * @dst: stateid data structure to fill in + * @cred: optional argument to retrieve credential * * Returns "true" and fills in "dst->data" * if inode had a delegation, * otherwise "false" is returned. */ -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, - fmode_t flags) +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, + nfs4_stateid *dst, struct rpc_cred **cred) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; @@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); + if (cred) + *cred = get_rpccred(delegation->cred); } rcu_read_unlock(); return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 333063e032f0..64724d252a79 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 741a92c470bb..979b3c4dee6a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -87,6 +87,7 @@ struct nfs_direct_req { int mirror_count; ssize_t count, /* bytes actually processed */ + max_count, /* max expected count */ bytes_left, /* bytes left to be sent */ io_start, /* start of IO */ error; /* any reported error */ @@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; + WARN_ON_ONCE(dreq->count >= dreq->max_count); + if (dreq->mirror_count == 1) { dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; dreq->count += hdr->good_bytes; @@ -275,7 +278,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->inode->i_lock; + cinfo->inode = dreq->inode; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; @@ -591,7 +594,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; dreq->inode = inode; - dreq->bytes_left = count; + dreq->bytes_left = dreq->max_count = count; dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -630,13 +633,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) @@ -671,13 +674,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) if (!nfs_pageio_add_request(&desc, req)) { nfs_list_remove_request(req); nfs_list_add_request(req, &failed); - spin_lock(cinfo.lock); + spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) dreq->error = desc.pg_error; else dreq->error = -EIO; - spin_unlock(cinfo.lock); + spin_unlock(&cinfo.inode->i_lock); } nfs_release_request(req); } @@ -1023,7 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; dreq->inode = inode; - dreq->bytes_left = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = iov_iter_count(iter); dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3384dc8e6683..aa59757389dc 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets >= size) goto out; for (i = 0; i < cinfo->ds->nbuckets; i++) { @@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, swap(cinfo->ds->buckets, buckets); cinfo->ds->nbuckets = size; out: - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); kfree(buckets); return 0; } @@ -890,6 +890,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_READ, + false, GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -915,6 +916,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0cb1abd535e3..0e8018bc9880 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -26,6 +26,8 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) +static struct group_info *ff_zero_group; + static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) { @@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) kfree(FF_LAYOUT_FROM_HDR(lo)); } -static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return -ENOBUFS; - memcpy(stateid, p, NFS4_STATEID_SIZE); + stateid->type = NFS4_PNFS_DS_STATEID_TYPE; + memcpy(stateid->data, p, NFS4_STATEID_SIZE); dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, p[0], p[1], p[2], p[3]); return 0; @@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { + struct rpc_cred *cred; + ff_layout_remove_mirror(mirror); kfree(mirror->fh_versions); - if (mirror->cred) - put_rpccred(mirror->cred); + cred = rcu_access_pointer(mirror->ro_cred); + if (cred) + put_rpccred(cred); + cred = rcu_access_pointer(mirror->rw_cred); + if (cred) + put_rpccred(cred); nfs4_ff_layout_put_deviceid(mirror->mirror_ds); kfree(mirror); } @@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new, { u64 new_end, old_end; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + return false; if (new->pls_range.iomode != old->pls_range.iomode) return false; old_end = pnfs_calc_offset_end(old->pls_range.offset, @@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new, new_end); if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) set_bit(NFS_LSEG_ROC, &new->pls_flags); - if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) - set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); return true; } @@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; - u32 ds_count; - u32 fh_count; + struct auth_cred acred = { .group_info = ff_zero_group }; + struct rpc_cred __rcu *cred; + u32 ds_count, fh_count, id; int j; rc = -EIO; @@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->efficiency = be32_to_cpup(p); /* stateid */ - rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); if (rc) goto out_err_free; @@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->fh_versions_cnt = fh_count; /* user */ - rc = decode_name(&stream, &fls->mirror_array[i]->uid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.uid = make_kuid(&init_user_ns, id); + /* group */ - rc = decode_name(&stream, &fls->mirror_array[i]->gid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.gid = make_kgid(&init_user_ns, id); + + /* find the cred for it */ + rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags)); + if (IS_ERR(cred)) { + rc = PTR_ERR(cred); + goto out_err_free; + } + + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + else + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->ro_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + } else { + cred = xchg(&mirror->rw_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; } - dprintk("%s: uid %d gid %d\n", __func__, - fls->mirror_array[i]->uid, - fls->mirror_array[i]->gid); + dprintk("%s: iomode %s uid %u gid %u\n", __func__, + lgr->range.iomode == IOMODE_READ ? "READ" : "RW", + from_kuid(&init_user_ns, acred.uid), + from_kgid(&init_user_ns, acred.gid)); } p = xdr_inline_decode(&stream, 4); @@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, else { int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets != 0) kfree(buckets); else { @@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, NFS_INVALID_STABLE_HOW; } } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return 0; } } @@ -786,6 +821,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, } static void +ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, + bool strict_iomode) +{ +retry_strict: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + strict_iomode, + GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + } + + /* If we don't have checking, do get a IOMODE_RW + * segment, and the server wants to avoid READs + * there, then retry! + */ + if (pgio->pg_lseg && !strict_iomode && + ff_layout_avoid_read_on_rw(pgio->pg_lseg)) { + strict_iomode = true; + goto retry_strict; + } +} + +static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; /* Use full layout for now */ - if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_READ, - GFP_KERNEL); - if (IS_ERR(pgio->pg_lseg)) { - pgio->pg_error = PTR_ERR(pgio->pg_lseg); - pgio->pg_lseg = NULL; - return; - } - } + if (!pgio->pg_lseg) + ff_layout_pg_get_read(pgio, req, false); + else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) + ff_layout_pg_get_read(pgio, req, true); + /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); - if (!ds) - goto out_mds; + if (!ds) { + if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_pnfs; + else + goto out_mds; + } + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgio->pg_mirror_idx = ds_idx; @@ -828,6 +890,12 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); + return; + +out_pnfs: + pnfs_set_lo_fail(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; } static void @@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, for (i = 0; i < pgio->pg_mirror_count; i++) { ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); - if (!ds) - goto out_mds; + if (!ds) { + if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_pnfs; + else + goto out_mds; + } pgm = &pgio->pg_mirrors[i]; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; @@ -883,6 +956,12 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); + return; + +out_pnfs: + pnfs_set_lo_fail(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; } static unsigned int @@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: - if (ff_layout_no_fallback_to_mds(lseg) || - ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return -NFS4ERR_RESET_TO_PNFS; reset: dprintk("%s Retry through MDS. Error %d\n", __func__, @@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &hdr->lseg->pls_layout->plh_flags); pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1260,7 +1337,7 @@ ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) } static bool -ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) +ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) { /* No mirroring for now */ struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); - if (ff_layout_has_available_ds(hdr->lseg)) - pnfs_read_resend_pnfs(hdr); - else - ff_layout_reset_read(hdr); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } - hdr->pgio_done_cb = ff_layout_read_done_cb; ff_layout_read_record_layoutstats_start(task, hdr); return 0; @@ -1496,14 +1567,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - bool retry_pnfs; - - retry_pnfs = ff_layout_has_available_ds(hdr->lseg); - dprintk("%s task %u reset io to %s\n", __func__, - task->tk_pid, retry_pnfs ? "pNFS" : "MDS"); - ff_layout_reset_write(hdr, retry_pnfs); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } @@ -1712,7 +1777,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1720,6 +1785,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + hdr->pgio_done_cb = ff_layout_read_done_cb; atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1737,11 +1803,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers == 3 ? &ff_layout_read_call_ops_v3 : &ff_layout_read_call_ops_v4, 0, RPC_TASK_SOFTCONN); - + put_rpccred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; return PNFS_NOT_ATTEMPTED; } @@ -1769,7 +1835,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) return PNFS_NOT_ATTEMPTED; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) return PNFS_NOT_ATTEMPTED; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1798,6 +1864,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) vers == 3 ? &ff_layout_write_call_ops_v3 : &ff_layout_write_call_ops_v4, sync, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); return PNFS_ATTEMPTED; } @@ -1824,7 +1891,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct rpc_clnt *ds_clnt; struct rpc_cred *ds_cred; u32 idx; - int vers; + int vers, ret; struct nfs_fh *fh; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); @@ -1838,7 +1905,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_err; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1854,10 +1921,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) if (fh) data->args.fh = fh; - return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, + ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, how, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); + return ret; out_err: pnfs_generic_prepare_to_resend_writes(data); pnfs_generic_commit_release(data); @@ -2223,6 +2292,11 @@ static int __init nfs4flexfilelayout_init(void) { printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", __func__); + if (!ff_zero_group) { + ff_zero_group = groups_alloc(0); + if (!ff_zero_group) + return -ENOMEM; + } return pnfs_register_layoutdriver(&flexfilelayout_type); } @@ -2231,6 +2305,10 @@ static void __exit nfs4flexfilelayout_exit(void) printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", __func__); pnfs_unregister_layoutdriver(&flexfilelayout_type); + if (ff_zero_group) { + put_group_info(ff_zero_group); + ff_zero_group = NULL; + } } MODULE_ALIAS("nfs-layouttype4-4"); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index dd353bb7dc0a..1bcdb15d0c41 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -10,7 +10,8 @@ #define FS_NFS_NFS4FLEXFILELAYOUT_H #define FF_FLAGS_NO_LAYOUTCOMMIT 1 -#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 #include "../pnfs.h" @@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror { u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - u32 uid; - u32 gid; - struct rpc_cred *cred; + struct rpc_cred __rcu *ro_cred; + struct rpc_cred __rcu *rw_cred; atomic_t ref; spinlock_t lock; struct nfs4_ff_layoutstat read_stat; @@ -154,6 +154,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg) } static inline bool +ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; +} + +static inline bool ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) { return nfs4_test_deviceid_unavailable(node); @@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred); bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); + #endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index add0e5a70bd6..0aa36be71fce 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, return e1->opnum < e2->opnum ? -1 : 1; if (e1->status != e2->status) return e1->status < e2->status ? -1 : 1; - ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + ret = memcmp(e1->stateid.data, e2->stateid.data, + sizeof(e1->stateid.data)); if (ret != 0) return ret; ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); @@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, return 0; } -/* currently we only support AUTH_NONE and AUTH_SYS */ -static rpc_authflavor_t -nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) +static struct rpc_cred * +ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) { - if (mirror->uid == (u32)-1) - return RPC_AUTH_NULL; - return RPC_AUTH_UNIX; -} + struct rpc_cred *cred, __rcu **pcred; -/* fetch cred for NFSv3 DS */ -static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, - struct nfs4_pnfs_ds *ds) -{ - if (ds->ds_clp && !mirror->cred && - mirror->mirror_ds->ds_versions[0].version == 3) { - struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; - struct rpc_cred *cred; - struct auth_cred acred = { - .uid = make_kuid(&init_user_ns, mirror->uid), - .gid = make_kgid(&init_user_ns, mirror->gid), - }; - - /* AUTH_NULL ignores acred */ - cred = auth->au_ops->lookup_cred(auth, &acred, 0); - if (IS_ERR(cred)) { - dprintk("%s: lookup_cred failed with %ld\n", - __func__, PTR_ERR(cred)); - return PTR_ERR(cred); - } else { - if (cmpxchg(&mirror->cred, NULL, cred)) - put_rpccred(cred); - } - } - return 0; + if (iomode == IOMODE_READ) + pcred = &mirror->ro_cred; + else + pcred = &mirror->rw_cred; + + rcu_read_lock(); + do { + cred = rcu_dereference(*pcred); + if (!cred) + break; + + cred = get_rpccred_rcu(cred); + } while(!cred); + rcu_read_unlock(); + return cred; } struct nfs_fh * @@ -356,7 +343,23 @@ out: return fh; } -/* Upon return, either ds is connected, or ds is NULL */ +/** + * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call + * @lseg: the layout segment we're operating on + * @ds_idx: index of the DS to use + * @fail_return: return layout on connect failure? + * + * Try to prepare a DS connection to accept an RPC call. This involves + * selecting a mirror to use and connecting the client to it if it's not + * already connected. + * + * Since we only need a single functioning mirror to satisfy a read, we don't + * want to return the layout if there is one. For writes though, any down + * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish + * between the two cases. + * + * Returns a pointer to a connected DS object on success or NULL on failure. + */ struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, bool fail_return) @@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - rpc_authflavor_t flavor; if (!ff_layout_mirror_valid(lseg, mirror)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", @@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); if (ds->ds_clp) - goto out_update_creds; - - flavor = nfs4_ff_layout_choose_authflavor(mirror); + goto out; /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. @@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version, - flavor); + RPC_AUTH_UNIX); /* connect success, check rsize/wsize limit */ if (ds->ds_clp) { @@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); - if (!fail_return) { - if (ff_layout_has_available_ds(lseg)) - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lseg->pls_layout->plh_flags); - else - pnfs_error_mark_layout_for_return(ino, lseg); - } else + if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; - goto out; } -out_update_creds: - if (ff_layout_update_mirror_cred(mirror, ds)) - ds = NULL; out: return ds; } @@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - struct rpc_cred *cred = ERR_PTR(-EINVAL); - - if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) - goto out; + struct rpc_cred *cred; - if (mirror && mirror->cred) - cred = mirror->cred; - else - cred = mdscred; -out: + if (mirror) { + cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + if (!cred) + cred = get_rpccred(mdscred); + } else { + cred = get_rpccred(mdscred); + } return cred; } @@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) return ff_rw_layout_has_available_ds(lseg); } +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg) +{ + return ff_layout_no_fallback_to_mds(lseg) || + ff_layout_has_available_ds(lseg); +} + +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return lseg->pls_range.iomode == IOMODE_RW && + ff_layout_no_read_on_rw(lseg); +} + module_param(dataserver_retrans, uint, 0644); MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " "retries a request before it attempts further " diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f1d1d2c472e9..5154fa65a2f2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); +int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b587ccd31083..b6cd15314bab 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index dff83460e5a6..aa03ed09ba06 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } +static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, + struct nfs_lock_context *src_lock, + struct file *dst, loff_t pos_dst, + struct nfs_lock_context *dst_lock, + size_t count) +{ + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct inode *dst_inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(dst_inode); + int status; + + status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + src_lock, FMODE_READ); + if (status) + return status; + + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); + if (status) + return status; + + status = nfs4_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, 0); + if (status == -ENOTSUPP) + server->caps &= ~NFS_CAP_COPY; + if (status) + return status; + + if (res.write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (status) + return status; + } + + truncate_pagecache_range(dst_inode, pos_dst, + pos_dst + res.write_res.count); + + return res.write_res.count; +} + +ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, + struct file *dst, loff_t pos_dst, + size_t count) +{ + struct nfs_server *server = NFS_SERVER(file_inode(dst)); + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + ssize_t err, err2; + + if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) + return -EOPNOTSUPP; + + src_lock = nfs_get_lock_context(nfs_file_open_context(src)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst); + dst_exception.state = dst_lock->open_context->state; + + do { + inode_lock(file_inode(dst)); + err = _nfs42_proc_copy(src, pos_src, src_lock, + dst, pos_dst, dst_lock, count); + inode_unlock(file_inode(dst)); + + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } + + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { @@ -232,7 +337,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) * with the current stateid. */ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0ca482a51e53..6dc6f2aea0d6 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -9,9 +9,22 @@ #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) +#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 /* wr_count */ + \ + 1 /* wr_committed */ + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) #define encode_allocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_allocate_maxsz (op_decode_hdr_maxsz) +#define encode_copy_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 + 2 + 2 + 1 + 1 + 1) +#define decode_copy_maxsz (op_decode_hdr_maxsz + \ + NFS42_WRITE_RES_SIZE + \ + 1 /* cr_consecutive */ + \ + 1 /* cr_synchronous */) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) @@ -49,6 +62,16 @@ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_maxsz) +#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_copy(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->src_stateid); + encode_nfs4_stateid(xdr, &args->dst_stateid); + + encode_uint64(xdr, args->src_pos); + encode_uint64(xdr, args->dst_pos); + encode_uint64(xdr, args->count); + + encode_uint32(xdr, 1); /* consecutive = true */ + encode_uint32(xdr, 1); /* synchronous = true */ + encode_uint32(xdr, 0); /* src server list */ +} + static void encode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_args *args, struct compound_hdr *hdr) @@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, } /* + * Encode COPY request + */ +static void nfs4_xdr_enc_copy(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_copy_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->src_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dst_fh, &hdr); + encode_copy(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode DEALLOCATE request */ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, @@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) return decode_op_hdr(xdr, OP_ALLOCATE); } +static int decode_write_response(struct xdr_stream *xdr, + struct nfs42_write_res *res) +{ + __be32 *p; + int stateids; + + p = xdr_inline_decode(xdr, 4 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + stateids = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &res->count); + res->verifier.committed = be32_to_cpup(p); + return decode_verifier(xdr, &res->verifier.verifier); + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy_requirements(struct xdr_stream *xdr, + struct nfs42_copy_res *res) { + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(!p)) + goto out_overflow; + + res->consecutive = be32_to_cpup(p++); + res->synchronous = be32_to_cpup(p++); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COPY); + if (status == NFS4ERR_OFFLOAD_NO_REQS) { + status = decode_copy_requirements(xdr, res); + if (status) + return status; + return NFS4ERR_OFFLOAD_NO_REQS; + } else if (status) + return status; + + status = decode_write_response(xdr, &res->write_res); + if (status) + return status; + + return decode_copy_requirements(xdr, res); +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); @@ -331,6 +447,36 @@ out: } /* + * Decode COPY response + */ +static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_copy_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy(xdr, res); +out: + return status; +} + +/* * Decode DEALLOCATE request */ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4afdee420d25..768456fa1b17 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -438,8 +438,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, - fmode_t, const struct nfs_lockowner *); +extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, + const struct nfs_lockowner *, nfs4_stateid *, + struct rpc_cred **); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -496,12 +497,15 @@ extern struct svc_version nfs4_callback_version4; static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) { - memcpy(dst, src, sizeof(*dst)); + memcpy(dst->data, src->data, sizeof(dst->data)); + dst->type = src->type; } static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) { - return memcmp(dst, src, sizeof(*dst)) == 0; + if (dst->type != src->type) + return false; + return memcmp(dst->data, src->data, sizeof(dst->data)) == 0; } static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d0390516467c..014b0e41ace5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id) } #ifdef CONFIG_NFS_V4_2 +static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t count, unsigned int flags) +{ + struct inode *in_inode = file_inode(file_in); + struct inode *out_inode = file_inode(file_out); + int ret; + + if (in_inode == out_inode) + return -EINVAL; + + /* flush any pending writes */ + ret = nfs_sync_inode(in_inode); + if (ret) + return ret; + ret = nfs_sync_inode(out_inode); + if (ret) + return ret; + + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); +} + static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) { loff_t ret; @@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = { .check_flags = nfs_check_flags, .setlease = simple_nosetlease, #ifdef CONFIG_NFS_V4_2 + .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, .clone_file_range = nfs42_clone_file_range, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 084e8570da18..de97567795a5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -74,6 +74,17 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +/* file attributes which can be mapped to nfs attributes */ +#define NFS4_VALID_ATTRS (ATTR_MODE \ + | ATTR_UID \ + | ATTR_GID \ + | ATTR_SIZE \ + | ATTR_ATIME \ + | ATTR_MTIME \ + | ATTR_CTIME \ + | ATTR_ATIME_SET \ + | ATTR_MTIME_SET) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -416,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -2558,15 +2570,20 @@ static int _nfs4_do_open(struct inode *dir, if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr, &label); - - nfs_fattr_init(opendata->o_res.f_attr); - status = nfs4_do_setattr(state->inode, cred, - opendata->o_res.f_attr, sattr, - state, label, olabel); - if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr, - opendata->o_res.f_attr); - nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + /* + * send create attributes which was not set by open + * with an extra setattr. + */ + if (sattr->ia_valid & NFS4_VALID_ATTRS) { + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state, label, olabel); + if (status == 0) { + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } } } if (opened && opendata->file_created) @@ -2676,6 +2693,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_cred *delegation_cred = NULL; unsigned long timestamp = jiffies; fmode_t fmode; bool truncate; @@ -2691,7 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; fmode = truncate ? FMODE_WRITE : FMODE_READ; - if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { /* Use that stateid */ } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { @@ -2700,13 +2718,17 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; if (!nfs4_valid_open_stateid(state)) return -EBADF; - if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - &lockowner) == -EIO) + if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, + &arg.stateid, &delegation_cred) == -EIO) return -EBADF; } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); + if (delegation_cred) + msg.rpc_cred = delegation_cred; status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + + put_rpccred(delegation_cred); if (status == 0 && state != NULL) renew_lease(server, timestamp); trace_nfs4_setattr(inode, &arg.stateid, status); @@ -4285,7 +4307,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid, if (l_ctx != NULL) lockowner = &l_ctx->lockowner; - return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); + return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL); } EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); @@ -4993,12 +5015,11 @@ static int nfs4_do_set_security_label(struct inode *inode, } static int -nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -6054,6 +6075,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; int status = -ENOLCK; @@ -6068,6 +6090,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(state->inode, request); if (status < 0) goto out; + mutex_lock(&sp->so_delegreturn_mutex); down_read(&nfsi->rwsem); if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { /* Yes: cache locks! */ @@ -6075,9 +6098,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock request->fl_flags = fl_flags & ~FL_SLEEP; status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: request->fl_flags = fl_flags; @@ -6255,11 +6280,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, @@ -6277,12 +6302,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { if (security_ismaclabel(key)) - return nfs4_set_security_label(dentry, buf, buflen); + return nfs4_set_security_label(inode, buf, buflen); return -EOPNOTSUPP; } @@ -7351,9 +7376,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) * always set csa_cachethis to FALSE because the current implementation * of the back channel DRC only supports caching the CB_SEQUENCE operation. */ -static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, + struct rpc_clnt *clnt) { unsigned int max_rqst_sz, max_resp_sz; + unsigned int max_bc_payload = rpc_max_bc_payload(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -7371,8 +7398,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.max_rqst_sz = PAGE_SIZE; - args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_rqst_sz = max_bc_payload; + args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; @@ -7476,7 +7503,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, }; int status; - nfs4_init_channel_attrs(&args); + nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); @@ -7820,40 +7847,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); - int ret; dprintk("--> %s\n", __func__); - /* Note the is a race here, where a CB_LAYOUTRECALL can come in - * right now covering the LAYOUTGET we are about to send. - * However, that is not so catastrophic, and there seems - * to be no way to prevent it completely. - */ - if (nfs41_setup_sequence(session, &lgp->args.seq_args, - &lgp->res.seq_res, task)) - return; - ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, - NFS_I(lgp->args.inode)->layout, - &lgp->args.range, - lgp->args.ctx->state); - if (ret < 0) - rpc_exit(task, ret); + nfs41_setup_sequence(session, &lgp->args.seq_args, + &lgp->res.seq_res, task); + dprintk("<-- %s\n", __func__); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + nfs41_sequence_done(task, &lgp->res.seq_res); + dprintk("<-- %s\n", __func__); +} + +static int +nfs4_layoutget_handle_exception(struct rpc_task *task, + struct nfs4_layoutget *lgp, struct nfs4_exception *exception) +{ struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - struct nfs4_state *state = NULL; - unsigned long timeo, now, giveup; + int status = task->tk_status; dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - if (!nfs41_sequence_done(task, &lgp->res.seq_res)) - goto out; - - switch (task->tk_status) { + switch (status) { case 0: goto out; @@ -7863,57 +7884,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * retry go inband. */ case -NFS4ERR_LAYOUTUNAVAILABLE: - task->tk_status = -ENODATA; + status = -ENODATA; goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_BADLAYOUT: - goto out_overflow; + status = -EOVERFLOW; + goto out; /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client * (or clients) writing to the same RAID stripe except when * the minlength argument is 0 (see RFC5661 section 18.43.3). + * + * Treat it like we would RECALLCONFLICT -- we retry for a little + * while, and then eventually give up. */ case -NFS4ERR_LAYOUTTRYLATER: - if (lgp->args.minlength == 0) - goto out_overflow; - /* - * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall - * existing layout before getting a new one). - */ - case -NFS4ERR_RECALLCONFLICT: - timeo = rpc_get_timeout(task->tk_client); - giveup = lgp->args.timestamp + timeo; - now = jiffies; - if (time_after(giveup, now)) { - unsigned long delay; - - /* Delay for: - * - Not less then NFS4_POLL_RETRY_MIN. - * - One last time a jiffie before we give up - * - exponential backoff (time_now minus start_attempt) - */ - delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, - min((giveup - now - 1), - now - lgp->args.timestamp)); - - dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", - __func__, delay); - rpc_delay(task, delay); - /* Do not call nfs4_async_handle_error() */ - goto out_restart; + if (lgp->args.minlength == 0) { + status = -EOVERFLOW; + goto out; } - break; + /* Fallthrough */ + case -NFS4ERR_RECALLCONFLICT: + nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, + exception); + status = -ERECALLCONFLICT; + goto out; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + exception->timeout = 0; spin_lock(&inode->i_lock); if (nfs4_stateid_match(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); /* If the open stateid was bad, then recover it. */ - state = lgp->args.ctx->state; + exception->state = lgp->args.ctx->state; break; } lo = NFS_I(inode)->layout; @@ -7926,25 +7933,21 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * with the current stateid. */ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else spin_unlock(&inode->i_lock); - goto out_restart; + status = -EAGAIN; + goto out; } - if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) - goto out_restart; + + status = nfs4_handle_exception(server, status, exception); + if (exception->retry) + status = -EAGAIN; out: dprintk("<-- %s\n", __func__); - return; -out_restart: - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; -out_overflow: - task->tk_status = -EOVERFLOW; - goto out; + return status; } static size_t max_response_pages(struct nfs_server *server) @@ -8013,7 +8016,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); @@ -8033,6 +8036,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .flags = RPC_TASK_ASYNC, }; struct pnfs_layout_segment *lseg = NULL; + struct nfs4_exception exception = { .timeout = *timeout }; int status = 0; dprintk("--> %s\n", __func__); @@ -8046,7 +8050,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; - lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; @@ -8056,13 +8059,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) if (IS_ERR(task)) return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); - if (status == 0) - status = task->tk_status; + if (status == 0) { + status = nfs4_layoutget_handle_exception(task, lgp, &exception); + *timeout = exception.timeout; + } + trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); @@ -8118,7 +8125,8 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); @@ -8653,6 +8661,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + if (s1->type != s2->type) + return false; + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) return false; @@ -8793,6 +8804,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE + | NFS_CAP_COPY | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d854693a15b0..9679f4749364 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -65,7 +65,10 @@ #define OPENOWNER_POOL_SIZE 8 -const nfs4_stateid zero_stateid; +const nfs4_stateid zero_stateid = { + { .data = { 0 } }, + .type = NFS4_SPECIAL_STATEID_TYPE, +}; static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * Byte-range lock aware utility to initialize the stateid of read/write * requests. */ -int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fmode_t fmode, const struct nfs_lockowner *lockowner) +int nfs4_select_rw_stateid(struct nfs4_state *state, + fmode_t fmode, const struct nfs_lockowner *lockowner, + nfs4_stateid *dst, struct rpc_cred **cred) { - int ret = nfs4_copy_lock_stateid(dst, state, lockowner); + int ret; + + if (cred != NULL) + *cred = NULL; + ret = nfs4_copy_lock_stateid(dst, state, lockowner); if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; /* returns true if delegation stateid found and copied */ - if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) { ret = 0; goto out; } diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2c8d05dae5b1..9c150b153782 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ + { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) TRACE_EVENT(pnfs_update_layout, @@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout, u64 count, enum pnfs_iomode iomode, struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, enum pnfs_update_layout_reason reason ), - TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_ARGS(inode, pos, count, iomode, lo, lseg, reason), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, fileid) @@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout, __field(enum pnfs_iomode, iomode) __field(int, layoutstateid_seq) __field(u32, layoutstateid_hash) + __field(long, lseg) __field(enum pnfs_update_layout_reason, reason) ), TP_fast_assign( @@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout, __entry->layoutstateid_seq = 0; __entry->layoutstateid_hash = 0; } + __entry->lseg = (long)lseg; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "iomode=%s pos=%llu count=%llu " - "layoutstateid=%d:0x%08x (%s)", + "layoutstateid=%d:0x%08x lseg=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout, (unsigned long long)__entry->pos, (unsigned long long)__entry->count, __entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg, show_pnfs_update_layout_reason(__entry->reason) ) ); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 88474a4fc669..661e753fe1c9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); } +static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_OPEN_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LOCK_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) if (status == -EIO) goto out; if (status == 0) { - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); if (unlikely(status)) goto out; } else if (status == -NFS4ERR_DENIED) @@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); if (status == 0) - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); return status; } @@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, __be32 *p; int status; - status = decode_stateid(xdr, &res->delegation); + status = decode_delegation_stateid(xdr, &res->delegation); if (unlikely(status)) return status; p = xdr_inline_decode(xdr, 4); @@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) nfs_increment_open_seqid(status, res->seqid); if (status) return status; - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); if (unlikely(status)) return status; @@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5838,6 +5856,12 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) +static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_getdeviceinfo(struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { @@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, if (unlikely(!p)) goto out_overflow; res->return_on_close = be32_to_cpup(p); - decode_stateid(xdr, &res->stateid); + decode_layout_stateid(xdr, &res->stateid); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr, goto out_overflow; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) - status = decode_stateid(xdr, &res->stateid); + status = decode_layout_stateid(xdr, &res->stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -7515,6 +7539,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(DEALLOCATE, enc_deallocate, dec_deallocate), PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), PROC(CLONE, enc_clone, dec_clone), + PROC(COPY, enc_copy, dec_copy), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1f6db4231057..174dd4cf5747 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ req->wb_page = page; - req->wb_index = page_file_index(page); - get_page(page); + if (page) { + req->wb_index = page_file_index(page); + get_page(page); + } req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 89a5ef4df08a..0c7e0d45a4de 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, }; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); + return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0); } static int @@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, @@ -522,13 +522,35 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, return rv; } -/* Returns count of number of matching invalid lsegs remaining in list - * after call. +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +/** + * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later + * @lo: layout header containing the lsegs + * @tmp_list: list head where doomed lsegs should go + * @recall_range: optional recall range argument to match (may be NULL) + * @seq: only invalidate lsegs obtained prior to this sequence (may be 0) + * + * Walk the list of lsegs in the layout header, and tear down any that should + * be destroyed. If "recall_range" is specified then the segment must match + * that range. If "seq" is non-zero, then only match segments that were handed + * out at or before that sequence. + * + * Returns number of matching invalid lsegs remaining in list after scanning + * it and purging them. */ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -540,10 +562,12 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (!recall_range || should_free_lseg(&lseg->pls_range, recall_range)) { - dprintk("%s: freeing lseg %p iomode %d " + if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) + continue; + dprintk("%s: freeing lseg %p iomode %d seq %u" "offset %llu length %llu\n", __func__, - lseg, lseg->pls_range.iomode, lseg->pls_range.offset, - lseg->pls_range.length); + lseg, lseg->pls_range.iomode, lseg->pls_seq, + lseg->pls_range.offset, lseg->pls_range.length); if (!mark_lseg_invalid(lseg, tmp_list)) remaining++; } @@ -730,15 +754,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } -/* - * Compare 2 layout stateid sequence ids, to see which is newer, - * taking into account wraparound issues. - */ -static bool pnfs_seqid_is_newer(u32 s1, u32 s2) -{ - return (s32)(s1 - s2) > 0; -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -781,50 +796,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } -int -pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state) -{ - int status = 0; - - dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo)) { - status = -EAGAIN; - } else if (!nfs4_valid_open_stateid(open_state)) { - status = -EBADF; - } else if (list_empty(&lo->plh_segs) || - test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { - int seq; - - do { - seq = read_seqbegin(&open_state->seqlock); - nfs4_stateid_copy(dst, &open_state->stateid); - } while (read_seqretry(&open_state->seqlock, seq)); - } else - nfs4_stateid_copy(dst, &lo->plh_stateid); - spin_unlock(&lo->plh_inode->i_lock); - dprintk("<-- %s\n", __func__); - return status; -} - /* -* Get layout from server. -* for now, assume that whole file layouts are requested. -* arg->offset: 0 -* arg->length: all ones -*/ + * Get layout from server. + * for now, assume that whole file layouts are requested. + * arg->offset: 0 + * arg->length: all ones + */ static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, + nfs4_stateid *stateid, const struct pnfs_layout_range *range, - gfp_t gfp_flags) + long *timeout, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; - struct pnfs_layout_segment *lseg; loff_t i_size; dprintk("--> %s\n", __func__); @@ -834,40 +821,31 @@ send_layoutget(struct pnfs_layout_hdr *lo, * store in lseg. If we race with a concurrent seqid morphing * op, then re-send the LAYOUTGET. */ - do { - lgp = kzalloc(sizeof(*lgp), gfp_flags); - if (lgp == NULL) - return NULL; - - i_size = i_size_read(ino); - - lgp->args.minlength = PAGE_SIZE; - if (lgp->args.minlength > range->length) - lgp->args.minlength = range->length; - if (range->iomode == IOMODE_READ) { - if (range->offset >= i_size) - lgp->args.minlength = 0; - else if (i_size - range->offset < lgp->args.minlength) - lgp->args.minlength = i_size - range->offset; - } - lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - pnfs_copy_range(&lgp->args.range, range); - lgp->args.type = server->pnfs_curr_ld->id; - lgp->args.inode = ino; - lgp->args.ctx = get_nfs_open_context(ctx); - lgp->gfp_flags = gfp_flags; - lgp->cred = lo->plh_lc_cred; - - lseg = nfs4_proc_layoutget(lgp, gfp_flags); - } while (lseg == ERR_PTR(-EAGAIN)); - - if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) - lseg = NULL; - else - pnfs_layout_clear_fail_bit(lo, - pnfs_iomode_to_fail_bit(range->iomode)); + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return ERR_PTR(-ENOMEM); - return lseg; + i_size = i_size_read(ino); + + lgp->args.minlength = PAGE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + pnfs_copy_range(&lgp->args.range, range); + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + nfs4_stateid_copy(&lgp->args.stateid, stateid); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; + + return nfs4_proc_layoutget(lgp, timeout, gfp_flags); } static void pnfs_clear_layoutcommit(struct inode *inode, @@ -899,6 +877,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; pnfs_get_layout_hdr(lo); clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); return true; @@ -969,6 +948,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) bool send; nfs4_stateid_copy(&stateid, &lo->plh_stateid); + stateid.seqid = cpu_to_be32(lo->plh_return_seq); iomode = lo->plh_return_iomode; send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); @@ -1012,7 +992,7 @@ _pnfs_return_layout(struct inode *ino) pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { struct pnfs_layout_range range = { @@ -1341,23 +1321,28 @@ out_existing: /* * iomode matching rules: - * iomode lseg match - * ----- ----- ----- - * ANY READ true - * ANY RW true - * RW READ false - * RW RW true - * READ READ true - * READ RW true + * iomode lseg strict match + * iomode + * ----- ----- ------ ----- + * ANY READ N/A true + * ANY RW N/A true + * RW READ N/A false + * RW RW N/A true + * READ READ N/A true + * READ RW true false + * READ RW false true */ static bool pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, - const struct pnfs_layout_range *range) + const struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_range range1; if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || + (range->iomode != ls_range->iomode && + strict_iomode == true) || !pnfs_lseg_range_intersecting(ls_range, range)) return 0; @@ -1372,7 +1357,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, */ static struct pnfs_layout_segment * pnfs_find_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range) + struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_segment *lseg, *ret = NULL; @@ -1381,7 +1367,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && - pnfs_lseg_range_match(&lseg->pls_range, range)) { + pnfs_lseg_range_match(&lseg->pls_range, range, + strict_iomode)) { ret = pnfs_get_lseg(lseg); break; } @@ -1498,6 +1485,7 @@ pnfs_update_layout(struct inode *ino, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool strict_iomode, gfp_t gfp_flags) { struct pnfs_layout_range arg = { @@ -1505,27 +1493,30 @@ pnfs_update_layout(struct inode *ino, .offset = pos, .length = count, }; - unsigned pg_offset; + unsigned pg_offset, seq; struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = NULL; struct pnfs_layout_segment *lseg = NULL; + nfs4_stateid stateid; + long timeout = 0; + unsigned long giveup = jiffies + rpc_get_timeout(server->client); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; } if (iomode == IOMODE_READ && i_size_read(ino) == 0) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; } if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; } @@ -1536,14 +1527,14 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; @@ -1551,14 +1542,34 @@ lookup_again: /* if LAYOUTGET already failed once we don't try again */ if (pnfs_layout_io_test_failed(lo, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; } - first = list_empty(&lo->plh_segs); - if (first) { - /* The first layoutget for the file. Need to serialize per + lseg = pnfs_find_lseg(lo, &arg, strict_iomode); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); + goto out_unlock; + } + + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + goto out_unlock; + } + + /* + * Choose a stateid for the LAYOUTGET. If we don't have a layout + * stateid, or it has been invalidated, then we must use the open + * stateid. + */ + if (lo->plh_stateid.seqid == 0 || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + + /* + * The first layoutget for the file. Need to serialize per * RFC 5661 Errata 3208. */ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, @@ -1567,18 +1578,17 @@ lookup_again: wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); goto lookup_again; } + + first = true; + do { + seq = read_seqbegin(&ctx->state->seqlock); + nfs4_stateid_copy(&stateid, &ctx->state->stateid); + } while (read_seqretry(&ctx->state->seqlock, seq)); } else { - /* Check to see if the layout for the given range - * already exists - */ - lseg = pnfs_find_lseg(lo, &arg); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - PNFS_UPDATE_LAYOUT_FOUND_CACHED); - goto out_unlock; - } + nfs4_stateid_copy(&stateid, &lo->plh_stateid); } /* @@ -1593,15 +1603,17 @@ lookup_again: pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg, PNFS_UPDATE_LAYOUT_RETRY); goto lookup_again; } - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } if (pnfs_layoutgets_blocked(lo)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; } @@ -1626,10 +1638,36 @@ lookup_again: if (arg.length != NFS4_MAX_UINT64) arg.length = PAGE_ALIGN(arg.length); - lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - atomic_dec(&lo->plh_outstanding); - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); + if (IS_ERR(lseg)) { + switch(PTR_ERR(lseg)) { + case -ERECALLCONFLICT: + if (time_after(jiffies, giveup)) + lseg = NULL; + /* Fallthrough */ + case -EAGAIN: + pnfs_put_layout_hdr(lo); + if (first) + pnfs_clear_first_layoutget(lo); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + goto lookup_again; + } + /* Fallthrough */ + default: + if (!nfs_error_is_fatal(PTR_ERR(lseg))) { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + } + } + } else { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + } + + atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1678,38 +1716,36 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; LIST_HEAD(free_me); - int status = -EINVAL; if (!pnfs_sanity_check_layout_range(&res->range)) - goto out; + return ERR_PTR(-EINVAL); /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); - if (!lseg || IS_ERR(lseg)) { + if (IS_ERR_OR_NULL(lseg)) { if (!lseg) - status = -ENOMEM; - else - status = PTR_ERR(lseg); - dprintk("%s: Could not allocate layout: error %d\n", - __func__, status); - goto out; + lseg = ERR_PTR(-ENOMEM); + + dprintk("%s: Could not allocate layout: error %ld\n", + __func__, PTR_ERR(lseg)); + return lseg; } init_lseg(lo, lseg); lseg->pls_range = res->range; + lseg->pls_seq = be32_to_cpu(res->stateid.seqid); spin_lock(&ino->i_lock); if (pnfs_layoutgets_blocked(lo)) { dprintk("%s forget reply due to state\n", __func__); - goto out_forget_reply; + goto out_forget; } if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); - status = -EAGAIN; - goto out_forget_reply; + goto out_forget; } pnfs_set_layout_stateid(lo, &res->stateid, false); } else { @@ -1718,7 +1754,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) * inode invalid, and don't bother validating the stateid * sequence number. */ - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); lo->plh_barrier = be32_to_cpu(res->stateid.seqid); @@ -1735,18 +1771,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me); return lseg; -out: - return ERR_PTR(status); -out_forget_reply: +out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - goto out; + return ERR_PTR(-EAGAIN); } static void -pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, + u32 seq) { if (lo->plh_return_iomode == iomode) return; @@ -1754,6 +1789,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) + lo->plh_return_seq = seq; } /** @@ -1769,7 +1806,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -1792,8 +1830,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - pnfs_set_plh_return_iomode(lo, return_range->iomode); } + + if (remaining) + pnfs_set_plh_return_info(lo, return_range->iomode, seq); + return remaining; } @@ -1810,13 +1851,14 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); - pnfs_set_plh_return_iomode(lo, range.iomode); + pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, + &range, lseg->pls_seq)) { nfs4_stateid stateid; enum pnfs_iomode iomode = lo->plh_return_iomode; @@ -1849,6 +1891,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r req_offset(req), rd_size, IOMODE_READ, + false, GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1873,6 +1916,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, req_offset(req), wb_size, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -2143,12 +2187,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. */ -int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); - return nfs_pageio_resend(&pgio, hdr); + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + nfs_pageio_init_read(&pgio, hdr->inode, false, + hdr->completion_ops); + hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); + } } EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); @@ -2158,12 +2205,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; - int err = 0; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); if (trypnfs == PNFS_TRY_AGAIN) - err = pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || err) + pnfs_read_resend_pnfs(hdr); + if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) pnfs_read_through_mds(desc, hdr); } @@ -2405,7 +2451,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) spin_lock(&inode->i_lock); if (!NFS_I(inode)->layout) { spin_unlock(&inode->i_lock); - goto out; + goto out_clear_layoutstats; } hdr = NFS_I(inode)->layout; pnfs_get_layout_hdr(hdr); @@ -2434,6 +2480,7 @@ out_free: kfree(data); out_put: pnfs_put_layout_hdr(hdr); +out_clear_layoutstats: smp_mb__before_atomic(); clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); smp_mb__after_atomic(); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1ac1db5f6dad..b21bd0bee784 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -64,6 +64,7 @@ struct pnfs_layout_segment { struct list_head pls_lc_list; struct pnfs_layout_range pls_range; atomic_t pls_refcount; + u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; struct work_struct pls_work; @@ -194,6 +195,7 @@ struct pnfs_layout_hdr { unsigned long plh_flags; nfs4_stateid plh_stateid; u32 plh_barrier; /* ignore lower seqids */ + u32 plh_return_seq; enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ @@ -258,16 +260,14 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier); -int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, - struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -282,12 +282,13 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); -int pnfs_read_resend_pnfs(struct nfs_pgio_header *); +void pnfs_read_resend_pnfs(struct nfs_pgio_header *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool strict_iomode, gfp_t gfp_flags); void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 4aaed890048f..0dfc476da3e1 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. - * Note this must be called holding the inode (/cinfo) lock + * Note this must be called holding i_lock */ void pnfs_generic_clear_request_commit(struct nfs_page *req, @@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); @@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; @@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct pnfs_layout_segment *freeme; int i; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (pnfs_generic_transfer_commit_list(&b->written, dst, cinfo, 0)) { freeme = b->wlseg; b->wlseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); goto restart; } } @@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) LIST_HEAD(pages); int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) @@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) freeme = bucket->clseg; bucket->clseg = NULL; list_splice_init(&bucket->committing, &pages); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static unsigned int @@ -238,14 +238,31 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages, struct pnfs_commit_bucket *bucket; bucket = &cinfo->ds->buckets[data->ds_commit_index]; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); list_splice_init(&bucket->committing, pages); data->lseg = bucket->clseg; bucket->clseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } +/* Helper function for pnfs_generic_commit_pagelist to catch an empty + * page list. This can happen when two commits race. */ +static bool +pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, + struct nfs_commit_data *data, + struct nfs_commit_info *cinfo) +{ + if (list_empty(pages)) { + if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) + wake_up_atomic_t(&cinfo->mds->rpcs_out); + nfs_commitdata_release(data); + return true; + } + + return false; +} + /* This follows nfs_commit_list pretty closely */ int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, @@ -280,6 +297,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); if (data->ds_commit_index < 0) { + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages, + data, cinfo)) + continue; + nfs_init_commit(data, mds_pages, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), @@ -288,6 +310,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, LIST_HEAD(pages); pnfs_fetch_commit_bucket_list(&pages, data, cinfo); + + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(&pages, + data, cinfo)) + continue; + nfs_init_commit(data, &pages, data->lseg, cinfo); initiate_commit(data, how); } @@ -874,12 +902,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -896,7 +924,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f1268280244e..2137e0202f25 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = { enum { Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, + Opt_xprt_rdma6, Opt_xprt_err }; @@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_tcp, "tcp" }, { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, + { Opt_xprt_rdma6, "rdma6" }, { Opt_xprt_err, NULL } }; @@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; + case Opt_xprt_rdma6: + protofamily = AF_INET6; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; @@ -2408,6 +2412,11 @@ static int nfs_compare_super_address(struct nfs_server *server1, struct nfs_server *server2) { struct sockaddr *sap1, *sap2; + struct rpc_xprt *xprt1 = server1->client->cl_xprt; + struct rpc_xprt *xprt2 = server2->client->cl_xprt; + + if (!net_eq(xprt1->xprt_net, xprt2->xprt_net)) + return 0; sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5f4fd53e5764..e1c74d3db64d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req) static int wb_priority(struct writeback_control *wbc) { int ret = 0; - if (wbc->for_reclaim) - return FLUSH_HIGHPRI | FLUSH_COND_STABLE; + if (wbc->sync_mode == WB_SYNC_ALL) ret = FLUSH_COND_STABLE; return ret; @@ -737,7 +736,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) head = req->wb_head; spin_lock(&inode->i_lock); - if (likely(!PageSwapCache(head->wb_page))) { + if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); smp_mb__after_atomic(); @@ -759,7 +758,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) static void nfs_mark_request_dirty(struct nfs_page *req) { - __set_page_dirty_nobuffers(req->wb_page); + if (req->wb_page) + __set_page_dirty_nobuffers(req->wb_page); } /* @@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold the cinfo->lock, and the nfs_page lock. + * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. */ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -832,10 +832,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(cinfo->lock); - nfs_mark_page_unstable(req->wb_page, cinfo); + spin_unlock(&cinfo->inode->i_lock); + if (req->wb_page) + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -864,7 +865,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode) { - cinfo->lock = &inode->i_lock; + cinfo->inode = inode; cinfo->mds = &NFS_I(inode)->commit_info; cinfo->ds = pnfs_get_ds_info(inode); cinfo->dreq = NULL; @@ -967,7 +968,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo) return cinfo->mds->ncommit; } -/* cinfo->lock held by caller */ +/* cinfo->inode->i_lock held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -979,7 +980,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); @@ -1005,7 +1006,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->mds->ncommit > 0) { const int max = INT_MAX; @@ -1013,7 +1014,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return ret; } @@ -1709,6 +1710,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, { struct nfs_commit_data *data; + /* another commit raced with us */ + if (list_empty(head)) + return 0; + data = nfs_commitdata_alloc(); if (!data) @@ -1724,6 +1729,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, return -ENOMEM; } +int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) +{ + struct inode *inode = file_inode(file); + struct nfs_open_context *open; + struct nfs_commit_info cinfo; + struct nfs_page *req; + int ret; + + open = get_nfs_open_context(nfs_file_open_context(file)); + req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out_put; + } + + nfs_init_cinfo_from_inode(&cinfo, inode); + + memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); + nfs_request_add_commit_list(req, &cinfo); + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret > 0) + ret = 0; + + nfs_free_request(req); +out_put: + put_nfs_open_context(open); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_commit_file); + /* * COMMIT call returned */ @@ -1748,7 +1783,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - nfs_clear_page_commit(req->wb_page); + if (req->wb_page) + nfs_clear_page_commit(req->wb_page); dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a8d15beee5cb..6aaf3e351391 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -272,10 +272,21 @@ struct o2hb_region { struct delayed_work hr_write_timeout_work; unsigned long hr_last_timeout_start; + /* negotiate timer, used to negotiate extending hb timeout. */ + struct delayed_work hr_nego_timeout_work; + unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + /* Used during o2hb_check_slot to hold a copy of the block * being checked because we temporarily have to zero out the * crc field. */ struct o2hb_disk_heartbeat_block *hr_tmp_block; + + /* Message key for negotiate timeout message. */ + unsigned int hr_key; + struct list_head hr_handler_list; + + /* last hb status, 0 for success, other value for error. */ + int hr_last_hb_status; }; struct o2hb_bio_wait_ctxt { @@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; +#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2) + +enum { + O2HB_NEGO_TIMEOUT_MSG = 1, + O2HB_NEGO_APPROVE_MSG = 2, +}; + +struct o2hb_nego_msg { + u8 node_num; +}; + static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; @@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work) o2quo_disk_timeout(); } -static void o2hb_arm_write_timeout(struct o2hb_region *reg) +static void o2hb_arm_timeout(struct o2hb_region *reg) { /* Arm writeout only after thread reaches steady state */ if (atomic_read(®->hr_steady_iterations) != 0) @@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) spin_unlock(&o2hb_live_lock); } cancel_delayed_work(®->hr_write_timeout_work); - reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); + + cancel_delayed_work(®->hr_nego_timeout_work); + /* negotiate timeout must be less than write timeout. */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); + memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); } -static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +static void o2hb_disarm_timeout(struct o2hb_region *reg) { cancel_delayed_work_sync(®->hr_write_timeout_work); + cancel_delayed_work_sync(®->hr_nego_timeout_work); +} + +static int o2hb_send_nego_msg(int key, int type, u8 target) +{ + struct o2hb_nego_msg msg; + int status, ret; + + msg.node_num = o2nm_this_node(); +again: + ret = o2net_send_message(type, key, &msg, sizeof(msg), + target, &status); + + if (ret == -EAGAIN || ret == -ENOMEM) { + msleep(100); + goto again; + } + + return ret; +} + +static void o2hb_nego_timeout(struct work_struct *work) +{ + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int master_node, i, ret; + struct o2hb_region *reg; + + reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work); + /* don't negotiate timeout if last hb failed since it is very + * possible io failed. Should let write timeout fence self. + */ + if (reg->hr_last_hb_status) + return; + + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + /* lowest node as master node to make negotiate decision. */ + master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); + + if (master_node == o2nm_this_node()) { + if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, + config_item_name(®->hr_item), reg->hr_dev_name); + set_bit(master_node, reg->hr_nego_node_bitmap); + } + if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, + sizeof(reg->hr_nego_node_bitmap))) { + /* check negotiate bitmap every second to do timeout + * approve decision. + */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(1000)); + + return; + } + + printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n", + config_item_name(®->hr_item), reg->hr_dev_name); + /* approve negotiate timeout request. */ + o2hb_arm_timeout(reg); + + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + if (i == master_node) + continue; + + mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i); + ret = o2hb_send_nego_msg(reg->hr_key, + O2HB_NEGO_APPROVE_MSG, i); + if (ret) + mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n", + i, ret); + } + } else { + /* negotiate timeout with master node. */ + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), + reg->hr_dev_name, master_node); + ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, + master_node); + if (ret) + mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n", + master_node, ret); + } +} + +static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + struct o2hb_nego_msg *nego_msg; + + nego_msg = (struct o2hb_nego_msg *)msg->buf; + printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n", + nego_msg->node_num, config_item_name(®->hr_item), reg->hr_dev_name); + if (nego_msg->node_num < O2NM_MAX_NODES) + set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); + else + mlog(ML_ERROR, "got nego timeout message from bad node.\n"); + + return 0; +} + +static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + + printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n", + config_item_name(®->hr_item), reg->hr_dev_name); + o2hb_arm_timeout(reg); + return 0; } static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) @@ -1032,7 +1172,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* Skip disarming the timeout if own slot has stale/bad data */ if (own_slot_ok) { o2hb_set_quorum_device(reg); - o2hb_arm_write_timeout(reg); + o2hb_arm_timeout(reg); + reg->hr_last_timeout_start = jiffies; } bail: @@ -1096,6 +1237,7 @@ static int o2hb_thread(void *data) before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); + reg->hr_last_hb_status = ret; after_hb = ktime_get_real(); @@ -1114,7 +1256,7 @@ static int o2hb_thread(void *data) } } - o2hb_disarm_write_timeout(reg); + o2hb_disarm_timeout(reg); /* unclean stop is only used in very bad situation */ for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) @@ -1451,6 +1593,7 @@ static void o2hb_region_release(struct config_item *item) list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); + o2net_unregister_handler_list(®->hr_handler_list); kfree(reg); } @@ -1762,6 +1905,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, } INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + INIT_DELAYED_WORK(®->hr_nego_timeout_work, o2hb_nego_timeout); /* * A node is considered live after it has beat LIVE_THRESHOLD @@ -1995,13 +2139,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + /* this is the same way to generate msg key as dlm, for local heartbeat, + * name is also the same, so make initial crc value different to avoid + * message key conflict. + */ + reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS, + name, strlen(name)); + INIT_LIST_HEAD(®->hr_handler_list); + ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_timeout_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto free; + + ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_approve_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto unregister_handler; + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); if (ret) { config_item_put(®->hr_item); - goto free; + goto unregister_handler; } return ®->hr_item; + +unregister_handler: + o2net_unregister_handler_list(®->hr_handler_list); free: kfree(reg); return ERR_PTR(ret); diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b95e7df5b76a..94b18369b1cc 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,6 +44,9 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. * + * New in version 12 + * - Negotiate hb timeout when storage is down. + * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -75,7 +78,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 11ULL +#define O2NET_PROTOCOL_VERSION 12ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0748777f2e2a..c56a7679df93 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } if (is_bad_inode(inode)) { iput(inode); - if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || - (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) - /* Return OCFS2_FILECHECK_ERR_XXX related errno */ - inode = ERR_PTR(rc); - else - inode = ERR_PTR(-ESTALE); + inode = ERR_PTR(rc); goto bail; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ad16995c9e7a..d2053853951e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 99c19545752c..5893ddde0e4b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -448,13 +448,14 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, name, buffer, @@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler, } static int orangefs_xattr_set_trusted(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, name, buffer, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index cc514da6f3e7..80aa6f1eb336 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -336,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct dentry *upperdir; struct dentry *upperdentry; const struct cred *old_cred; - struct cred *override_cred; char *link = NULL; if (WARN_ON(!workdir)) @@ -357,28 +356,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return PTR_ERR(link); } - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_free_link; - - override_cred->fsuid = stat->uid; - override_cred->fsgid = stat->gid; - /* - * CAP_SYS_ADMIN for copying up extended attributes - * CAP_DAC_OVERRIDE for create - * CAP_FOWNER for chmod, timestamp update - * CAP_FSETID for chmod - * CAP_CHOWN for chown - * CAP_MKNOD for mknod - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - cap_raise(override_cred->cap_effective, CAP_MKNOD); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(dentry->d_sb); err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { @@ -401,9 +379,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, out_unlock: unlock_rename(workdir, upperdir); revert_creds(old_cred); - put_cred(override_cred); -out_free_link: if (link) free_page((unsigned long) link); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index b3fc0a35bf62..22f0253a3567 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -405,28 +405,13 @@ static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, err = ovl_create_upper(dentry, inode, &stat, link, hardlink); } else { const struct cred *old_cred; - struct cred *override_cred; - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_iput; - - /* - * CAP_SYS_ADMIN for setting opaque xattr - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(dentry->d_sb); err = ovl_create_over_whiteout(dentry, inode, &stat, link, hardlink); revert_creds(old_cred); - put_cred(override_cred); } if (!err) @@ -662,32 +647,11 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (OVL_TYPE_PURE_UPPER(type)) { err = ovl_remove_upper(dentry, is_dir); } else { - const struct cred *old_cred; - struct cred *override_cred; - - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); + const struct cred *old_cred = ovl_override_creds(dentry->d_sb); err = ovl_remove_and_whiteout(dentry, is_dir); revert_creds(old_cred); - put_cred(override_cred); } out_drop_write: ovl_drop_write(dentry); @@ -725,7 +689,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool new_is_dir = false; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; - struct cred *override_cred = NULL; err = -EINVAL; if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) @@ -794,26 +757,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_opaque = !OVL_TYPE_PURE_UPPER(old_type); new_opaque = !OVL_TYPE_PURE_UPPER(new_type); - if (old_opaque || new_opaque) { - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); - } + if (old_opaque || new_opaque) + old_cred = ovl_override_creds(old->d_sb); if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { opaquedir = ovl_check_empty_and_clear(new); @@ -943,10 +888,8 @@ out_dput_old: out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: - if (old_opaque || new_opaque) { + if (old_opaque || new_opaque) revert_creds(old_cred); - put_cred(override_cred); - } out_drop_write: ovl_drop_write(old); out: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c7b31a03dc9c..0ed7c4012437 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -210,8 +210,9 @@ static bool ovl_is_private_xattr(const char *name) return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int err; struct dentry *upperdentry; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 99ec4b035237..4bd9b5ba8f42 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -153,6 +153,7 @@ void ovl_drop_write(struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); bool ovl_is_whiteout(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); @@ -171,8 +172,9 @@ int ovl_check_d_type_supported(struct path *realpath); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags); ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index da186ee4f846..cf37fc76fc9f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -36,6 +36,7 @@ struct ovl_dir_cache { struct ovl_readdir_data { struct dir_context ctx; + struct dentry *dentry; bool is_lowest; struct rb_root root; struct list_head *list; @@ -206,21 +207,10 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) struct ovl_cache_entry *p; struct dentry *dentry; const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) - return -ENOMEM; - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(rdd->dentry->d_sb); - inode_lock(dir->d_inode); - err = 0; - // XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex); + err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; @@ -234,7 +224,6 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) inode_unlock(dir->d_inode); } revert_creds(old_cred); - put_cred(override_cred); return err; } @@ -290,6 +279,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, + .dentry = dentry, .list = list, .root = RB_ROOT, .is_lowest = false, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ed53ae0fe868..ce02f46029da 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -42,6 +42,8 @@ struct ovl_fs { long lower_namelen; /* pathnames of lower and upper dirs, for show_options */ struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; }; struct ovl_dir_cache; @@ -265,6 +267,13 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + static bool ovl_is_opaquedir(struct dentry *dentry) { int res; @@ -603,6 +612,7 @@ static void ovl_put_super(struct super_block *sb) kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); kfree(ufs->config.workdir); + put_cred(ufs->creator_cred); kfree(ufs); } @@ -1064,16 +1074,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* * Upper should support d_type, else whiteouts are visible. * Given workdir and upper are on same fs, we can do - * iterate_dir() on workdir. + * iterate_dir() on workdir. This check requires successful + * creation of workdir in previous step. */ - err = ovl_check_d_type_supported(&workpath); - if (err < 0) - goto out_put_workdir; + if (ufs->workdir) { + err = ovl_check_d_type_supported(&workpath); + if (err < 0) + goto out_put_workdir; - if (!err) { - pr_err("overlayfs: upper fs needs to support d_type.\n"); - err = -EINVAL; - goto out_put_workdir; + if (!err) { + pr_err("overlayfs: upper fs needs to support d_type.\n"); + err = -EINVAL; + goto out_put_workdir; + } } } @@ -1108,10 +1121,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; + ufs->creator_cred = prepare_creds(); + if (!ufs->creator_cred) + goto out_put_lower_mnt; + err = -ENOMEM; oe = ovl_alloc_entry(numlower); if (!oe) - goto out_put_lower_mnt; + goto out_put_cred; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); if (!root_dentry) @@ -1144,6 +1161,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) out_free_oe: kfree(oe); +out_put_cred: + put_cred(ufs->creator_cred); out_put_lower_mnt: for (i = 0; i < ufs->numlower; i++) mntput(ufs->lower_mnt[i]); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 2c60f17e7d92..8a4a266beff3 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler, static int posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; int ret; diff --git a/fs/proc/root.c b/fs/proc/root.c index 55bc7d6c8aac..06702783bf40 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); diff --git a/fs/readdir.c b/fs/readdir.c index 68ef06efe6bc..9d0212c374d6 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -35,13 +35,13 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; - if (shared) + if (shared) { inode_lock_shared(inode); - else - inode_lock(inode); - // res = mutex_lock_killable(&inode->i_mutex); - // if (res) - // goto out; + } else { + res = down_write_killable(&inode->i_rwsem); + if (res) + goto out; + } res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 86aeb9dd805a..e4cbb7719906 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, } static int -security_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +security_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (IS_PRIVATE(d_inode(dentry))) + if (IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 31837f031f59..f15a5f9e84ce 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, } static int -trusted_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +trusted_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index f7c39731684b..dc59df43b2db 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +user_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!reiserfs_xattrs_user(dentry->d_sb)) + if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 595ca0debe11..69e287e20732 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -260,7 +260,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\txattr_names %u\n", ui->xattr_names); pr_err("\tdirty %u\n", ui->dirty); pr_err("\txattr %u\n", ui->xattr); - pr_err("\tbulk_read %u\n", ui->xattr); + pr_err("\tbulk_read %u\n", ui->bulk_read); pr_err("\tsynced_i_size %llu\n", (unsigned long long)ui->synced_i_size); pr_err("\tui_size %llu\n", diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6c277eb6aef9..b5fc27969e9d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, } static int ubifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); diff --git a/fs/xattr.c b/fs/xattr.c index b11945e15fde..4beafc43daa5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -100,7 +100,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_op->setxattr) { - error = inode->i_op->setxattr(dentry, name, value, size, flags); + error = inode->i_op->setxattr(dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, @@ -655,6 +655,7 @@ strcmp_prefix(const char *a, const char *a_prefix) * operations to the correct xattr_handler. */ #define for_each_xattr_handler(handlers, handler) \ + if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) @@ -668,7 +669,7 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name) const struct xattr_handler *handler; if (!*name) - return NULL; + return ERR_PTR(-EINVAL); for_each_xattr_handler(handlers, handler) { const char *n; @@ -744,7 +745,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) * Find the handler for the prefix and dispatch its set() operation. */ int -generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) +generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) { const struct xattr_handler *handler; @@ -753,7 +755,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, value, size, flags); + return handler->set(handler, dentry, inode, name, value, size, flags); } /* @@ -768,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name) handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); + return handler->set(handler, dentry, d_inode(dentry), name, NULL, + 0, XATTR_REPLACE); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 686ba6fb20dd..339c696bbc01 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) } void * -kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, - xfs_km_flags_t flags) +kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { - void *new; + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; - new = kmem_alloc(newsize, flags); - if (ptr) { - if (new) - memcpy(new, ptr, - ((oldsize < newsize) ? oldsize : newsize)); - kmem_free(ptr); - } - return new; + do { + ptr = krealloc(old, newsize, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", + current->comm, current->pid, + newsize, __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); } void * diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index d1c66e465ca5..689f746224e7 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fa3b948ef9c2..4e126f41a0aa 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -242,37 +242,21 @@ xfs_attr_set( return error; } - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (rsvd) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(args.trans, &tres, args.total, 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &tres, args.total, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &args.trans); + if (error) return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); @@ -429,31 +413,15 @@ xfs_attr_remove( return error; /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); - - /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (flags & ATTR_ROOT) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0, + (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, + &args.trans); + if (error) return error; - } xfs_ilock(dp, XFS_ILOCK_EXCL); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ce41d7fe753c..932381caef1b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1121,15 +1121,14 @@ xfs_bmap_add_attrfork( mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : @@ -6026,13 +6025,10 @@ xfs_bmap_split_extent( xfs_fsblock_t firstfsb; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 974d62e677f4..e5bb9cc3b243 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -257,15 +257,12 @@ xfs_dir2_block_to_sf( * * Convert the inode to local format and copy the data in. */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); + xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size); + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_d.di_size = size; logflags |= XFS_ILOG_DDATA; - memcpy(dp->i_df.if_u1.if_data, dst, size); - dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 11faf7df14c8..bbcc8c7a44b3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -231,6 +231,48 @@ xfs_iformat_fork( return error; } +void +xfs_init_local_fork( + struct xfs_inode *ip, + int whichfork, + const void *data, + int size) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int mem_size = size, real_size = 0; + bool zero_terminate; + + /* + * If we are using the local fork to store a symlink body we need to + * zero-terminate it so that we can pass it back to the VFS directly. + * Overallocate the in-memory fork by one for that and add a zero + * to terminate it below. + */ + zero_terminate = S_ISLNK(VFS_I(ip)->i_mode); + if (zero_terminate) + mem_size++; + + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (mem_size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(mem_size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + + if (size) { + memcpy(ifp->if_u1.if_data, data, size); + if (zero_terminate) + ifp->if_u1.if_data[size] = '\0'; + } + + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ifp->if_flags |= XFS_IFINLINE; +} + /* * The file is in-lined in the on-disk inode. * If it fits into if_inline_data, then copy @@ -248,8 +290,6 @@ xfs_iformat_local( int whichfork, int size) { - xfs_ifork_t *ifp; - int real_size; /* * If the size is unreasonable, then something @@ -265,22 +305,8 @@ xfs_iformat_local( ip->i_mount, dip); return -EFSCORRUPTED; } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; + + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -516,7 +542,6 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -660,7 +685,6 @@ xfs_idata_realloc( ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, real_size, - ifp->if_real_bytes, KM_SLEEP | KM_NOFS); } } else { @@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct( if (rnew_size != ifp->if_real_bytes) { ifp->if_u1.if_extents = kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, KM_NOFS); + rnew_size, KM_NOFS); } if (rnew_size > ifp->if_real_bytes) { memset(&ifp->if_u1.if_extents[ifp->if_bytes / @@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect( if (new_size == 0) { xfs_iext_destroy(ifp); } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_NOFS); + ifp->if_u1.if_ext_irec = + kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS); } } @@ -1497,6 +1519,24 @@ xfs_iext_indirect_to_direct( } /* + * Remove all records from the indirection array. + */ +STATIC void +xfs_iext_irec_remove_all( + struct xfs_ifork *ifp) +{ + int nlists; + int i; + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = 0; i < nlists; i++) + kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf); + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; +} + +/* * Free incore file extents. */ void @@ -1504,14 +1544,7 @@ xfs_iext_destroy( xfs_ifork_t *ifp) /* inode fork pointer */ { if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; + xfs_iext_irec_remove_all(ifp); } else if (ifp->if_real_bytes) { kmem_free(ifp->if_u1.if_extents); } else if (ifp->if_bytes) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7d3b1ed6dcbe..f95e072ae646 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -134,6 +134,7 @@ void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); +void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index d54a8018b079..e8f49c029ff0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -212,6 +212,11 @@ typedef struct xfs_trans_header { #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ /* + * The only type valid for th_type in CIL-enabled file system logs: + */ +#define XFS_TRANS_CHECKPOINT 40 + +/* * Log item types. */ #define XFS_LI_EFI 0x1236 diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 8a53eaa349f4..12ca86778e02 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -838,12 +838,10 @@ xfs_sync_sb( struct xfs_trans *tp; int error; - tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, + XFS_TRANS_NO_WRITECOUNT, &tp); + if (error) return error; - } xfs_log_sb(tp); if (wait) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 81ac870834da..16002b5ec4eb 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -56,103 +56,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; /* - * Transaction types. Used to distinguish types of buffers. These never reach - * the log. - */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -/* 17 was XFS_TRANS_WRITE_SYNC */ -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_SB_CHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_FSYNC_TS 35 -#define XFS_TRANS_GROWFSRT_ALLOC 36 -#define XFS_TRANS_GROWFSRT_ZERO 37 -#define XFS_TRANS_GROWFSRT_FREE 38 -#define XFS_TRANS_SWAPEXT 39 -#define XFS_TRANS_CHECKPOINT 40 -#define XFS_TRANS_ICREATE 41 -#define XFS_TRANS_CREATE_TMPFILE 42 -#define XFS_TRANS_TYPE_MAX 43 -/* new transaction types need to be reflected in xfs_logprint(8) */ - -#define XFS_TRANS_TYPES \ - { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ - { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ - { XFS_TRANS_INACTIVE, "INACTIVE" }, \ - { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ - { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ - { XFS_TRANS_REMOVE, "REMOVE" }, \ - { XFS_TRANS_LINK, "LINK" }, \ - { XFS_TRANS_RENAME, "RENAME" }, \ - { XFS_TRANS_MKDIR, "MKDIR" }, \ - { XFS_TRANS_RMDIR, "RMDIR" }, \ - { XFS_TRANS_SYMLINK, "SYMLINK" }, \ - { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ - { XFS_TRANS_GROWFS, "GROWFS" }, \ - { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ - { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ - { XFS_TRANS_WRITEID, "WRITEID" }, \ - { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ - { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ - { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ - { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ - { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ - { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ - { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ - { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ - { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ - { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ - { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ - { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ - { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ - { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ - { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ - { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ - { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_ICREATE, "ICREATE" }, \ - { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ - { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } - -/* * This structure is used to track log items associated with * a transaction. It points to the log item and keeps some * flags to track the state of the log item. It also tracks @@ -181,8 +84,9 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer - count in superblock */ +#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */ + /* * Field values for xfs_trans_mod_sb. */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c535887c60a8..4c463b99fe57 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -84,23 +84,71 @@ xfs_find_bdev_for_inode( } /* - * We're now finished for good with this ioend structure. - * Update the page state via the associated buffer_heads, - * release holds on the inode and bio, and finally free - * up memory. Do not use the ioend after this. + * We're now finished for good with this page. Update the page state via the + * associated buffer_heads, paying attention to the start and end offsets that + * we need to process on the page. + */ +static void +xfs_finish_page_writeback( + struct inode *inode, + struct bio_vec *bvec, + int error) +{ + unsigned int end = bvec->bv_offset + bvec->bv_len - 1; + struct buffer_head *head, *bh; + unsigned int off = 0; + + ASSERT(bvec->bv_offset < PAGE_SIZE); + ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); + ASSERT(end < PAGE_SIZE); + ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0); + + bh = head = page_buffers(bvec->bv_page); + + do { + if (off < bvec->bv_offset) + goto next_bh; + if (off > end) + break; + bh->b_end_io(bh, !error); +next_bh: + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); +} + +/* + * We're now finished for good with this ioend structure. Update the page + * state, release holds on bios, and finally free up memory. Do not use the + * ioend after this. */ STATIC void xfs_destroy_ioend( - xfs_ioend_t *ioend) + struct xfs_ioend *ioend, + int error) { - struct buffer_head *bh, *next; + struct inode *inode = ioend->io_inode; + struct bio *last = ioend->io_bio; + struct bio *bio, *next; - for (bh = ioend->io_buffer_head; bh; bh = next) { - next = bh->b_private; - bh->b_end_io(bh, !ioend->io_error); - } + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bvec; + int i; + + /* + * For the last bio, bi_private points to the ioend, so we + * need to explicitly end the iteration here. + */ + if (bio == last) + next = NULL; + else + next = bio->bi_private; - mempool_free(ioend, xfs_ioend_pool); + /* walk each page on bio, ending page IO on them */ + bio_for_each_segment_all(bvec, bio, i) + xfs_finish_page_writeback(inode, bvec, error); + + bio_put(bio); + } } /* @@ -120,13 +168,9 @@ xfs_setfilesize_trans_alloc( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } ioend->io_append_trans = tp; @@ -174,7 +218,8 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend) + struct xfs_ioend *ioend, + int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_trans *tp = ioend->io_append_trans; @@ -188,53 +233,32 @@ xfs_setfilesize_ioend( __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ - if (ioend->io_error) { + if (error) { xfs_trans_cancel(tp); - return ioend->io_error; + return error; } return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } /* - * Schedule IO completion handling on the final put of an ioend. - * - * If there is no work to do we might as well call it a day and free the - * ioend right now. - */ -STATIC void -xfs_finish_ioend( - struct xfs_ioend *ioend) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - - if (ioend->io_type == XFS_IO_UNWRITTEN) - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) - queue_work(mp->m_data_workqueue, &ioend->io_work); - else - xfs_destroy_ioend(ioend); - } -} - -/* * IO write completion. */ STATIC void xfs_end_io( struct work_struct *work) { - xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); - int error = 0; + struct xfs_ioend *ioend = + container_of(work, struct xfs_ioend, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = ioend->io_bio->bi_error; /* * Set an error if the mount has shut down and proceed with end I/O * processing so it can perform whatever cleanups are necessary. */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - ioend->io_error = -EIO; + error = -EIO; /* * For unwritten extents we need to issue transactions to convert a @@ -244,55 +268,33 @@ xfs_end_io( * on error. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { - if (ioend->io_error) + if (error) goto done; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); } else if (ioend->io_append_trans) { - error = xfs_setfilesize_ioend(ioend); + error = xfs_setfilesize_ioend(ioend, error); } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: - if (error) - ioend->io_error = error; - xfs_destroy_ioend(ioend); + xfs_destroy_ioend(ioend, error); } -/* - * Allocate and initialise an IO completion structure. - * We need to track unwritten extent write completion here initially. - * We'll need to extend this for updating the ondisk inode size later - * (vs. incore size). - */ -STATIC xfs_ioend_t * -xfs_alloc_ioend( - struct inode *inode, - unsigned int type) +STATIC void +xfs_end_bio( + struct bio *bio) { - xfs_ioend_t *ioend; - - ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); - - /* - * Set the count to 1 initially, which will prevent an I/O - * completion callback from happening before we have started - * all the I/O from calling the completion routine too early. - */ - atomic_set(&ioend->io_remaining, 1); - ioend->io_error = 0; - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; - ioend->io_inode = inode; - ioend->io_buffer_head = NULL; - ioend->io_buffer_tail = NULL; - ioend->io_offset = 0; - ioend->io_size = 0; - ioend->io_append_trans = NULL; + struct xfs_ioend *ioend = bio->bi_private; + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - INIT_WORK(&ioend->io_work, xfs_end_io); - return ioend; + if (ioend->io_type == XFS_IO_UNWRITTEN) + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend, bio->bi_error); } STATIC int @@ -364,50 +366,6 @@ xfs_imap_valid( offset < imap->br_startoff + imap->br_blockcount; } -/* - * BIO completion handler for buffered IO. - */ -STATIC void -xfs_end_bio( - struct bio *bio) -{ - xfs_ioend_t *ioend = bio->bi_private; - - if (!ioend->io_error) - ioend->io_error = bio->bi_error; - - /* Toss bio and pass work off to an xfsdatad thread */ - bio->bi_private = NULL; - bio->bi_end_io = NULL; - bio_put(bio); - - xfs_finish_ioend(ioend); -} - -STATIC void -xfs_submit_ioend_bio( - struct writeback_control *wbc, - xfs_ioend_t *ioend, - struct bio *bio) -{ - atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; - bio->bi_end_io = xfs_end_bio; - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); -} - -STATIC struct bio * -xfs_alloc_ioend_bio( - struct buffer_head *bh) -{ - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - - ASSERT(bio->bi_private == NULL); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - return bio; -} - STATIC void xfs_start_buffer_writeback( struct buffer_head *bh) @@ -452,28 +410,35 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) } /* - * Submit all of the bios for an ioend. We are only passed a single ioend at a - * time; the caller is responsible for chaining prior to submission. + * Submit the bio for an ioend. We are passed an ioend with a bio attached to + * it, and we submit that bio. The ioend may be used for multiple bio + * submissions, so we only want to allocate an append transaction for the ioend + * once. In the case of multiple bio submission, each bio will take an IO + * reference to the ioend to ensure that the ioend completion is only done once + * all bios have been submitted and the ioend is really done. * * If @fail is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the ioend chain rather - * than submit it to IO. This typically only happens on a filesystem shutdown. + * and unlocked them. In this situation, we need to fail the bio and ioend + * rather than submit it to IO. This typically only happens on a filesystem + * shutdown. */ STATIC int xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend, + struct xfs_ioend *ioend, int status) { - struct buffer_head *bh; - struct bio *bio; - sector_t lastblock = 0; - /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(ioend) && + !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); + + ioend->io_bio->bi_private = ioend; + ioend->io_bio->bi_end_io = xfs_end_bio; + /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -481,33 +446,73 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_error = status; - xfs_finish_ioend(ioend); + ioend->io_bio->bi_error = status; + bio_endio(ioend->io_bio); return status; } - bio = NULL; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, + ioend->io_bio); + return 0; +} - if (!bio) { -retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static void +xfs_init_bio_from_bh( + struct bio *bio, + struct buffer_head *bh) +{ + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; +} - if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static struct xfs_ioend * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type, + xfs_off_t offset, + struct buffer_head *bh) +{ + struct xfs_ioend *ioend; + struct bio *bio; - lastblock = bh->b_blocknr; - } - if (bio) - xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend); - return 0; + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + xfs_init_bio_from_bh(bio, bh); + + ioend = container_of(bio, struct xfs_ioend, io_inline_bio); + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_size = 0; + ioend->io_offset = offset; + INIT_WORK(&ioend->io_work, xfs_end_io); + ioend->io_append_trans = NULL; + ioend->io_bio = bio; + return ioend; +} + +/* + * Allocate a new bio, and chain the old bio to the new one. + * + * Note that we have to do perform the chaining in this unintuitive order + * so that the bi_private linkage is set up in the right direction for the + * traversal in xfs_destroy_ioend(). + */ +static void +xfs_chain_bio( + struct xfs_ioend *ioend, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + struct bio *new; + + new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + xfs_init_bio_from_bh(new, bh); + + bio_chain(ioend->io_bio, new); + bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, + ioend->io_bio); + ioend->io_bio = new; } /* @@ -523,27 +528,24 @@ xfs_add_to_ioend( struct buffer_head *bh, xfs_off_t offset, struct xfs_writepage_ctx *wpc, + struct writeback_control *wbc, struct list_head *iolist) { if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || bh->b_blocknr != wpc->last_block + 1 || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - struct xfs_ioend *new; - if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - - new = xfs_alloc_ioend(inode, wpc->io_type); - new->io_offset = offset; - new->io_buffer_head = bh; - new->io_buffer_tail = bh; - wpc->ioend = new; - } else { - wpc->ioend->io_buffer_tail->b_private = bh; - wpc->ioend->io_buffer_tail = bh; + wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh); } - bh->b_private = NULL; + /* + * If the buffer doesn't fit into the bio we need to allocate a new + * one. This shouldn't happen more than once for a given buffer. + */ + while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) + xfs_chain_bio(wpc->ioend, wbc, bh); + wpc->ioend->io_size += bh->b_size; wpc->last_block = bh->b_blocknr; xfs_start_buffer_writeback(bh); @@ -803,7 +805,7 @@ xfs_writepage_map( lock_buffer(bh); if (wpc->io_type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list); + xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); count++; } @@ -1391,13 +1393,10 @@ xfs_end_io_direct_write( trace_xfs_end_io_direct_write_append(ip, offset, size); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); - return error; - } - error = xfs_setfilesize(ip, tp, offset, size); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, + &tp); + if (!error) + error = xfs_setfilesize(ip, tp, offset, size); } return error; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b4421177b68d..814aab790713 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,7 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern mempool_t *xfs_ioend_pool; +extern struct bio_set *xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. @@ -37,22 +37,19 @@ enum { { XFS_IO_OVERWRITE, "overwrite" } /* - * xfs_ioend struct manages large extent writes for XFS. - * It can manage several multi-page bio's at once. + * Structure for buffered I/O completions. */ -typedef struct xfs_ioend { +struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ unsigned int io_type; /* delalloc / unwritten */ - int io_error; /* I/O error code */ - atomic_t io_remaining; /* hold count */ struct inode *io_inode; /* file being written to */ - struct buffer_head *io_buffer_head;/* buffer linked list head */ - struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ -} xfs_ioend_t; + struct bio *io_bio; /* bio being built */ + struct bio io_inline_bio; /* MUST BE LAST! */ +}; extern const struct address_space_operations xfs_address_space_operations; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index dd4824589470..e3da5d448bcf 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ +/* Return 0 on success, or -errno; other state communicated via *context */ typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int, unsigned char *); + unsigned char *, int, int); typedef struct xfs_attr_list_context { struct xfs_inode *dp; /* inode */ @@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context { int firstu; /* first used byte in buffer */ int flags; /* from VOP call */ int resynch; /* T/F: resynch with cursor */ - int put_value; /* T/F: need value for listent */ put_listent_func_t put_listent; /* list output fmt function */ int index; /* index into output buffer */ } xfs_attr_list_context_t; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 2bb959ada45b..55d214981ed2 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -405,21 +405,11 @@ xfs_attr_inactive( goto out_destroy_fork; xfs_iunlock(dp, lock_mode); - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ lock_mode = 0; - trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); - error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans); if (error) - goto out_cancel; + goto out_destroy_fork; lock_mode = XFS_ILOCK_EXCL; xfs_ilock(dp, lock_mode); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 4fa14820e2e2..d25f26b22ac9 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sfe->flags, sfe->nameval, (int)sfe->namelen, - (int)sfe->valuelen, - &sfe->nameval[sfe->namelen]); - + (int)sfe->valuelen); + if (error) + return error; /* * Either search callback finished early or * didn't fit it all in the buffer after all. */ if (context->seen_enough) break; - - if (error) - return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } trace_xfs_attr_list_sf_all(context); @@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->flags, sbp->name, sbp->namelen, - sbp->valuelen, - &sbp->name[sbp->namelen]); + sbp->valuelen); if (error) { kmem_free(sbuf); return error; @@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int( */ retval = 0; for (; i < ichdr.count; entry++, i++) { + char *name; + int namelen, valuelen; + if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int( continue; /* skip incomplete entries */ if (entry->flags & XFS_ATTR_LOCAL) { - xfs_attr_leaf_name_local_t *name_loc = - xfs_attr3_leaf_name_local(leaf, i); - - retval = context->put_listent(context, - entry->flags, - name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen), - &name_loc->nameval[name_loc->namelen]); - if (retval) - return retval; + xfs_attr_leaf_name_local_t *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + valuelen = be16_to_cpu(name_loc->valuelen); } else { - xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr3_leaf_name_remote(leaf, i); - - int valuelen = be32_to_cpu(name_rmt->valuelen); - - if (context->put_value) { - xfs_da_args_t args; - - memset((char *)&args, 0, sizeof(args)); - args.geo = context->dp->i_mount->m_attr_geo; - args.dp = context->dp; - args.whichfork = XFS_ATTR_FORK; - args.valuelen = valuelen; - args.rmtvaluelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); - args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = xfs_attr3_rmt_blocks( - args.dp->i_mount, valuelen); - retval = xfs_attr_rmtval_get(&args); - if (!retval) - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); - kmem_free(args.value); - } else { - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - NULL); - } - if (retval) - return retval; + xfs_attr_leaf_name_remote_t *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + valuelen = be32_to_cpu(name_rmt->valuelen); } + + retval = context->put_listent(context, entry->flags, + name, namelen, valuelen); + if (retval) + break; if (context->seen_enough) break; cursor->offset++; @@ -551,8 +519,7 @@ xfs_attr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { struct attrlist *alist = (struct attrlist *)context->alist; attrlist_ent_t *aep; @@ -581,7 +548,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 1; + return 0; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3b6309865c65..586bb64e674b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -72,18 +72,11 @@ xfs_zero_extent( struct xfs_mount *mp = ip->i_mount; xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); sector_t block = XFS_BB_TO_FSBT(mp, sector); - ssize_t size = XFS_FSB_TO_B(mp, count_fsb); - - if (IS_DAX(VFS_I(ip))) - return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), - sector, size); - - /* - * let the block layer decide on the fastest method of - * implementing the zeroing. - */ - return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + block << (mp->m_super->s_blocksize_bits - 9), + count_fsb << (mp->m_super->s_blocksize_bits - 9), + GFP_NOFS, true); } /* @@ -900,19 +893,15 @@ xfs_free_eofblocks( * Free them up now by truncating the file to * its current size. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) return -EAGAIN; - } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -1037,9 +1026,9 @@ xfs_alloc_file_space( /* * Allocate and setup the transaction. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + resrtextents, 0, &tp); + /* * Check for running out of space */ @@ -1048,7 +1037,6 @@ xfs_alloc_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1311,18 +1299,10 @@ xfs_free_file_space( * transaction to dip into the reserve blocks to ensure * the freeing of the space succeeds at ENOSPC. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); - - /* - * check for running out of space - */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, + &tp); if (error) { - /* - * Free the transaction structure. - */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1482,19 +1462,16 @@ xfs_shift_file_space( } while (!error && !done) { - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* * We would need to reserve permanent block for transaction. * This will come into picture when after shifting extent into * hole we found that adjacent extents can be merged which * may lead to freeing of a block during record update. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) break; - } xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, @@ -1747,12 +1724,9 @@ xfs_swap_extents( if (error) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_unlock; - } /* * Lock and join the inodes to the tansaction so that transaction commit diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9a2191b91137..e71cfbd5acb3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1100,22 +1100,18 @@ xfs_bwrite( return error; } -STATIC void +static void xfs_buf_bio_end_io( struct bio *bio) { - xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; /* * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (bio->bi_error) { - spin_lock(&bp->b_lock); - if (!bp->b_io_error) - bp->b_io_error = bio->bi_error; - spin_unlock(&bp->b_lock); - } + if (bio->bi_error) + cmpxchg(&bp->b_io_error, 0, bio->bi_error); if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4eb89bd4ee73..8bfb974f0772 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -183,6 +183,26 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ int b_error; /* error code on I/O */ + + /* + * async write failure retry count. Initialised to zero on the first + * failure, then when it exceeds the maximum configured without a + * success the write is considered to be failed permanently and the + * iodone handler will take appropriate action. + * + * For retry timeouts, we record the jiffie of the first failure. This + * means that we can change the retry timeout for buffers already under + * I/O and thus avoid getting stuck in a retry loop with a long timeout. + * + * last_error is used to ensure that we are getting repeated errors, not + * different errors. e.g. a block device might change ENOSPC to EIO when + * a failure timeout occurs, so we want to re-initialise the error + * retry behaviour appropriately when that happens. + */ + int b_retries; + unsigned long b_first_retry_time; /* in jiffies */ + int b_last_error; + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 99e91a0e554e..34257992934c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1042,35 +1042,22 @@ xfs_buf_do_callbacks( } } -/* - * This is the iodone() function for buffers which have had callbacks - * attached to them by xfs_buf_attach_iodone(). It should remove each - * log item from the buffer's list and call the callback of each in turn. - * When done, the buffer's fsprivate field is set to NULL and the buffer - * is unlocked with a call to iodone(). - */ -void -xfs_buf_iodone_callbacks( +static bool +xfs_buf_iodone_callback_error( struct xfs_buf *bp) { struct xfs_log_item *lip = bp->b_fspriv; struct xfs_mount *mp = lip->li_mountp; static ulong lasttime; static xfs_buftarg_t *lasttarg; - - if (likely(!bp->b_error)) - goto do_callbacks; + struct xfs_error_cfg *cfg; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_item_iodone(bp, _RET_IP_); - goto do_callbacks; - } + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { @@ -1079,45 +1066,93 @@ xfs_buf_iodone_callbacks( } lasttarg = bp->b_target; + /* synchronous writes will have callers process the error */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + ASSERT(bp->b_iodone != NULL); + /* * If the write was asynchronous then no one will be looking for the - * error. Clear the error state and write the buffer out again. - * - * XXX: This helps against transient write errors, but we need to find - * a way to shut the filesystem down if the writes keep failing. - * - * In practice we'll shut the filesystem down soon as non-transient - * errors tend to affect the whole device and a failing log write - * will make us give up. But we really ought to do better here. + * error. If this is the first failure of this type, clear the error + * state and write the buffer out again. This means we always retry an + * async write failure at least once, but we also need to set the buffer + * up to behave correctly now for repeated failures. */ - if (bp->b_flags & XBF_ASYNC) { - ASSERT(bp->b_iodone != NULL); + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || + bp->b_last_error != bp->b_error) { + bp->b_flags |= (XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL); + bp->b_last_error = bp->b_error; + bp->b_retries = 0; + bp->b_first_retry_time = jiffies; + + xfs_buf_ioerror(bp, 0); + xfs_buf_submit(bp); + return true; + } - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + /* + * Repeated failure on an async write. Take action according to the + * error configuration we have been set up to use. + */ + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); - xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + goto permanent_error; + if (cfg->retry_timeout && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + goto permanent_error; - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL; - xfs_buf_submit(bp); - } else { - xfs_buf_relse(bp); - } + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + goto permanent_error; - return; - } + /* still a transient error, higher layers will retry */ + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; /* - * If the write of the buffer was synchronous, we want to make - * sure to return the error to the caller of xfs_bwrite(). + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. */ +permanent_error: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); +out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +/* + * This is the iodone() function for buffers which have had callbacks attached + * to them by xfs_buf_attach_iodone(). We need to iterate the items on the + * callback list, mark the buffer as having no more callbacks and then push the + * buffer through IO completion processing. + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + /* + * If there is an error, process it. Some errors require us + * to run callbacks after failure processing is done so we + * detect that and take appropriate action. + */ + if (bp->b_error && xfs_buf_iodone_callback_error(bp)) + return; + + /* + * Successful IO or permanent error. Either way, we can clear the + * retry state here in preparation for the next error that may occur. + */ + bp->b_last_error = 0; + bp->b_retries = 0; -do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 316b2a1bdba5..e0646659ce16 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -614,11 +614,10 @@ xfs_qm_dqread( trace_xfs_dqread(dqp); if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); if (error) - goto error1; + goto error0; } /* @@ -692,7 +691,7 @@ error0: * end of the chunk, skip ahead to first id in next allocated chunk * using the SEEK_DATA interface. */ -int +static int xfs_dq_get_next_id( xfs_mount_t *mp, uint type, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 85ce3032f815..47fc63295422 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -145,12 +145,10 @@ xfs_update_prealloc_flags( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); - error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, + 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -1553,7 +1551,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); @@ -1587,7 +1585,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1624,8 +1622,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ee3aaa0a5317..b4d75825ae37 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -198,14 +198,10 @@ xfs_growfs_data_private( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, - XFS_GROWFS_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } /* * Write new AG headers to disk. Non-transactional, but written @@ -243,8 +239,8 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - agf->agf_flfirst = 0; - agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; agf->agf_flcount = 0; tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bf2d60749278..99ee6eee5e0b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -37,9 +37,6 @@ #include <linux/kthread.h> #include <linux/freezer.h> -STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, - struct xfs_perag *pag, struct xfs_inode *ip); - /* * Allocate and initialise an xfs_inode. */ @@ -94,13 +91,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: @@ -118,6 +108,25 @@ xfs_inode_free( ip->i_itemp = NULL; } + kmem_zone_free(xfs_inode_zone, ip); +} + +static void +__xfs_inode_free( + struct xfs_inode *ip) +{ + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(ip->i_mount, vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -129,12 +138,123 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(ip->i_mount, vn_active); + __xfs_inode_free(ip); +} - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +static void +xfs_perag_set_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (pag->pag_ici_reclaimable++) + return; + + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(mp); + + trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + +static void +xfs_perag_clear_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (--pag->pag_ici_reclaimable) + return; + + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_set_reclaim_tag(pag); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +xfs_inode_clear_reclaim_tag( + struct xfs_perag *pag, + xfs_ino_t ino) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(pag->pag_mount, ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_clear_reclaim_tag(pag); } /* @@ -264,7 +384,7 @@ xfs_iget_cache_hit( */ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); + xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); @@ -723,121 +843,6 @@ xfs_inode_ag_iterator_tag( } /* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. - */ -static void -xfs_reclaim_work_queue( - struct xfs_mount *mp) -{ - - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, - msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); - } - rcu_read_unlock(); -} - -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - -static void -__xfs_inode_set_reclaim_tag( - struct xfs_perag *pag, - struct xfs_inode *ip) -{ - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - - if (!pag->pag_ici_reclaimable) { - /* propagate the reclaim tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(ip->i_mount); - - trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } - pag->pag_ici_reclaimable++; -} - -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_set_reclaim_tag( - xfs_inode_t *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_set_reclaim_tag(pag, ip); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - -STATIC void -__xfs_inode_clear_reclaim( - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - pag->pag_ici_reclaimable--; - if (!pag->pag_ici_reclaimable) { - /* clear the reclaim tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } -} - -STATIC void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - __xfs_inode_clear_reclaim(pag, ip); -} - -/* * Grab the inode for reclaim exclusively. * Return 0 if we grabbed it, non-zero otherwise. */ @@ -929,6 +934,7 @@ xfs_reclaim_inode( int sync_mode) { struct xfs_buf *bp = NULL; + xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: @@ -993,6 +999,22 @@ restart: xfs_iflock(ip); reclaim: + /* + * Because we use RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK and flush lock so + * that xfs_iflush_cluster() can be guaranteed to detect races with us + * here. By doing this, we guarantee that once xfs_iflush_cluster has + * locked both the XFS_ILOCK and the flush lock that it will see either + * a valid, flushable inode that will serialise correctly against the + * locks below, or it will see a clean (and invalid) inode that it can + * skip. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1006,9 +1028,9 @@ reclaim: */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); - __xfs_inode_clear_reclaim(pag, ip); + xfs_perag_clear_reclaim_tag(pag); spin_unlock(&pag->pag_ici_lock); /* @@ -1023,7 +1045,7 @@ reclaim: xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_inode_free(ip); + __xfs_inode_free(ip); return error; out_ifunlock: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 96f606deee31..ee6799e0476f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1030,7 +1030,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, 0); + code = xfs_trans_roll(&tp, NULL); if (committed != NULL) *committed = 1; @@ -1161,11 +1161,9 @@ xfs_create( rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_mkdir; - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { resblks = XFS_CREATE_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_create; - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } /* @@ -1174,20 +1172,19 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; - + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -1337,17 +1334,16 @@ xfs_create_tmpfile( return error; resblks = XFS_IALLOC_SPACE_RES(mp); - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); - tres = &M_RES(mp)->tr_create_tmpfile; - error = xfs_trans_reserve(tp, tres, resblks, 0); + + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1432,15 +1428,14 @@ xfs_link( if (error) goto std_return; - tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); } if (error) - goto error_return; + goto std_return; xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1710,11 +1705,9 @@ xfs_inactive_truncate( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); return error; } @@ -1764,8 +1757,6 @@ xfs_inactive_ifree( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - /* * The ifree transaction might need to allocate blocks for record * insertion to the finobt. We don't want to fail here at ENOSPC, so @@ -1781,9 +1772,8 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -1792,7 +1782,6 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp); return error; } @@ -2525,11 +2514,6 @@ xfs_remove( if (error) goto std_return; - if (is_dir) - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - else - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - /* * We try to get the real space reservation first, * allowing for directory btree deletion(s) implying @@ -2540,14 +2524,15 @@ xfs_remove( * block from the directory. */ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, + &tp); } if (error) { ASSERT(error != -ENOSPC); - goto out_trans_cancel; + goto std_return; } xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); @@ -2855,6 +2840,7 @@ xfs_rename_alloc_whiteout( * and flag it as linkable. */ drop_nlink(VFS_I(tmpfile)); + xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -2910,15 +2896,15 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { spaceres = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_wip; /* * Attach the dquots to the inodes @@ -3155,6 +3141,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: xfs_trans_cancel(tp); +out_release_wip: if (wip) IRELE(wip); return error; @@ -3162,16 +3149,16 @@ out_trans_cancel: STATIC int xfs_iflush_cluster( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_mount_t *mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; unsigned long first_index, mask; unsigned long inodes_per_cluster; - int ilist_size; - xfs_inode_t **ilist; - xfs_inode_t *iq; + int cilist_size; + struct xfs_inode **cilist; + struct xfs_inode *cip; int nr_found; int clcount = 0; int bufwasdelwri; @@ -3180,23 +3167,23 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); - ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); - if (!ilist) + cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); + if (!cilist) goto out_put; mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, first_index, inodes_per_cluster); if (nr_found == 0) goto out_free; for (i = 0; i < nr_found; i++) { - iq = ilist[i]; - if (iq == ip) + cip = cilist[i]; + if (cip == ip) continue; /* @@ -3205,20 +3192,30 @@ xfs_iflush_cluster( * We need to check under the i_flags_lock for a valid inode * here. Skip it if it is not valid or the wrong inode. */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino || - (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&ip->i_flags_lock); + spin_lock(&cip->i_flags_lock); + if (!cip->i_ino || + __xfs_iflags_test(cip, XFS_ISTALE)) { + spin_unlock(&cip->i_flags_lock); continue; } - spin_unlock(&ip->i_flags_lock); + + /* + * Once we fall off the end of the cluster, no point checking + * any more inodes in the list because they will also all be + * outside the cluster. + */ + if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { + spin_unlock(&cip->i_flags_lock); + break; + } + spin_unlock(&cip->i_flags_lock); /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated * later after the appropriate locks are acquired. */ - if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) continue; /* @@ -3226,15 +3223,28 @@ xfs_iflush_cluster( * then this inode cannot be flushed and is skipped. */ - if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(cip)) { + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; - if (!xfs_iflock_nowait(iq)) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + if (xfs_ipincount(cip)) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } - if (xfs_ipincount(iq)) { - xfs_ifunlock(iq); - xfs_iunlock(iq, XFS_ILOCK_SHARED); + + + /* + * Check the inode number again, just to be certain we are not + * racing with freeing in xfs_reclaim_inode(). See the comments + * in that function for more information as to why the initial + * check is not sufficient. + */ + if (!cip->i_ino) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } @@ -3242,18 +3252,18 @@ xfs_iflush_cluster( * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing. */ - if (!xfs_inode_clean(iq)) { + if (!xfs_inode_clean(cip)) { int error; - error = xfs_iflush_int(iq, bp); + error = xfs_iflush_int(cip, bp); if (error) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); goto cluster_corrupt_out; } clcount++; } else { - xfs_ifunlock(iq); + xfs_ifunlock(cip); } - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); } if (clcount) { @@ -3263,7 +3273,7 @@ xfs_iflush_cluster( out_free: rcu_read_unlock(); - kmem_free(ilist); + kmem_free(cilist); out_put: xfs_perag_put(pag); return 0; @@ -3306,8 +3316,8 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq, false); - kmem_free(ilist); + xfs_iflush_abort(cip, false); + kmem_free(cilist); xfs_perag_put(pag); return -EFSCORRUPTED; } @@ -3327,7 +3337,7 @@ xfs_iflush( struct xfs_buf **bpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp; + struct xfs_buf *bp = NULL; struct xfs_dinode *dip; int error; @@ -3369,14 +3379,22 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. + * Get the buffer containing the on-disk inode. We are doing a try-lock + * operation here, so we may get an EAGAIN error. In that case, we + * simply want to return with the inode still dirty. + * + * If we get any other error, we effectively have a corruption situation + * and we cannot flush the inode, so we treat it the same as failing + * xfs_iflush_int(). */ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 0); - if (error || !bp) { + if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } + if (error) + goto corrupt_out; /* * First flush out the inode that xfs_iflush was called with. @@ -3404,7 +3422,8 @@ xfs_iflush( return 0; corrupt_out: - xfs_buf_relse(bp); + if (bp) + xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: error = -EFSCORRUPTED; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 43e1d51b15eb..e52d7c7aeb5b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -440,6 +440,9 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, /* from xfs_iops.c */ +extern void xfs_setup_inode(struct xfs_inode *ip); +extern void xfs_setup_iops(struct xfs_inode *ip); + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at @@ -447,7 +450,6 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, * before we've completed instantiation. Otherwise we can do it * the moment the inode lookup is complete. */ -extern void xfs_setup_inode(struct xfs_inode *ip); static inline void xfs_finish_inode_setup(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_INEW); @@ -458,6 +460,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) static inline void xfs_setup_existing_inode(struct xfs_inode *ip) { xfs_setup_inode(ip); + xfs_setup_iops(ip); xfs_finish_inode_setup(ip); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c48b5b18d771..a1b07612224c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork( */ data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_real_bytes == 0 || - ip->i_df.if_real_bytes == data_bytes); + ip->i_df.if_real_bytes >= data_bytes); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, @@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork( */ data_bytes = roundup(ip->i_afp->if_bytes, 4); ASSERT(ip->i_afp->if_real_bytes == 0 || - ip->i_afp->if_real_bytes == data_bytes); + ip->i_afp->if_real_bytes >= data_bytes); ASSERT(ip->i_afp->if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, @@ -479,6 +479,8 @@ STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, struct list_head *buffer_list) + __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcb6c19ce3ea..dbca7375deef 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -277,7 +277,6 @@ xfs_readlink_by_handle( { struct dentry *dentry; __u32 olen; - void *link; int error; if (!capable(CAP_SYS_ADMIN)) @@ -288,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!d_is_symlink(dentry)) { + if (!d_inode(dentry)->i_op->readlink) { error = -EINVAL; goto out_dput; } @@ -298,21 +297,8 @@ xfs_readlink_by_handle( goto out_dput; } - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) { - error = -ENOMEM; - goto out_dput; - } - - error = xfs_readlink(XFS_I(d_inode(dentry)), link); - if (error) - goto out_kfree; - error = readlink_copy(hreq->ohandle, olen, link); - if (error) - goto out_kfree; + error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen); - out_kfree: - kfree(link); out_dput: dput(dentry); return error; @@ -334,12 +320,10 @@ xfs_set_dmattrs( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -1141,10 +1125,9 @@ xfs_ioctl_setattr_get_trans( if (XFS_FORCED_SHUTDOWN(mp)) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_cancel; + return ERR_PTR(error); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d81bdc080370..58391355a44d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -132,6 +132,7 @@ xfs_iomap_write_direct( int error; int lockmode; int bmapi_flags = XFS_BMAPI_PREALLOC; + uint tflags = 0; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); @@ -192,11 +193,6 @@ xfs_iomap_write_direct( return error; /* - * Allocate and setup the transaction - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - - /* * For DAX, we do not allocate unwritten extents, but instead we zero * the block before we commit the transaction. Ideally we'd like to do * this outside the transaction context, but if we commit and then crash @@ -209,23 +205,17 @@ xfs_iomap_write_direct( * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. */ - if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (ISUNWRITTEN(imap)) { - tp->t_flags |= XFS_TRANS_RESERVE; + tflags |= XFS_TRANS_RESERVE; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); - /* - * Check for running out of space, note: need lock to return - */ - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, + tflags, &tp); + if (error) return error; - } lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); @@ -726,15 +716,13 @@ xfs_iomap_write_allocate( nimaps = 0; while (nimaps == 0) { - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - nres, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres, + 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -878,25 +866,18 @@ xfs_iomap_write_unwritten( do { /* - * set up a transaction to convert the range of extents + * Set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. * - * Note that we open code the transaction allocation here - * to pass KM_NOFS--we can't risk to recursing back into - * the filesystem here as we might be asked to write out - * the same inode that we complete here and might deadlock - * on the iolock. + * Note that we can't risk to recursing back into the filesystem + * here as we might be asked to write out the same inode that we + * complete here and might deadlock on the iolock. */ - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index fb7dc61f4a29..c5d4eba6972e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -181,6 +181,8 @@ xfs_generic_create( } #endif + xfs_setup_iops(ip); + if (tmpfile) d_tmpfile(dentry, inode); else @@ -368,6 +370,8 @@ xfs_vn_symlink( if (unlikely(error)) goto out_cleanup_inode; + xfs_setup_iops(cip); + d_instantiate(dentry, inode); xfs_finish_inode_setup(cip); return 0; @@ -442,6 +446,16 @@ xfs_vn_get_link( return ERR_PTR(error); } +STATIC const char * +xfs_vn_get_link_inline( + struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); + return XFS_I(inode)->i_df.if_u1.if_data; +} + STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -599,12 +613,12 @@ xfs_setattr_nonsize( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + goto out_dqrele; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -633,12 +647,10 @@ xfs_setattr_nonsize( NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ - goto out_unlock; + goto out_cancel; } } - xfs_trans_ijoin(tp, ip, 0); - /* * Change file ownership. Must be the owner or privileged. */ @@ -722,10 +734,9 @@ xfs_setattr_nonsize( return 0; -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out_trans_cancel: +out_cancel: xfs_trans_cancel(tp); +out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; @@ -834,7 +845,7 @@ xfs_setattr_size( * We have to do all the page cache truncate work outside the * transaction context as the "lock" order is page lock->log space * reservation as defined by extent allocation in the writeback path. - * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but * having already truncated the in-memory version of the file (i.e. made * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate @@ -849,10 +860,9 @@ xfs_setattr_size( return error; truncate_setsize(inode, newsize); - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -971,12 +981,9 @@ xfs_vn_update_time( trace_xfs_update_time(ip); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); if (flags & S_CTIME) @@ -1167,6 +1174,18 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; +static const struct inode_operations xfs_inline_symlink_inode_operations = { + .readlink = generic_readlink, + .get_link = xfs_vn_get_link_inline, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, +}; + STATIC void xfs_diflags_to_iflags( struct inode *inode, @@ -1193,7 +1212,7 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode and set up the operation vectors. + * Initialize the Linux inode. * * When reading existing inodes from disk this is called directly from xfs_iget, * when creating a new inode it is called from xfs_ialloc after setting up the @@ -1232,32 +1251,12 @@ xfs_setup_inode( i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); - ip->d_ops = ip->i_mount->m_nondir_inode_ops; - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &xfs_inode_operations; - inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - case S_IFDIR: + if (S_ISDIR(inode->i_mode)) { lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); - if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) - inode->i_op = &xfs_dir_ci_inode_operations; - else - inode->i_op = &xfs_dir_inode_operations; - inode->i_fop = &xfs_dir_file_operations; ip->d_ops = ip->i_mount->m_dir_inode_ops; - break; - case S_IFLNK: - inode->i_op = &xfs_symlink_inode_operations; - if (!(ip->i_df.if_flags & XFS_IFINLINE)) - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - default: - inode->i_op = &xfs_inode_operations; - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; + } else { + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); } /* @@ -1277,3 +1276,35 @@ xfs_setup_inode( cache_no_acl(inode); } } + +void +xfs_setup_iops( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + if (ip->i_df.if_flags & XFS_IFINLINE) + inode->i_op = &xfs_inline_symlink_inode_operations; + else + inode->i_op = &xfs_symlink_inode_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b49ccf5c1d75..bde02f1fba73 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -435,8 +435,7 @@ xfs_log_reserve( int cnt, struct xlog_ticket **ticp, __uint8_t client, - bool permanent, - uint t_type) + bool permanent) { struct xlog *log = mp->m_log; struct xlog_ticket *tic; @@ -456,7 +455,6 @@ xfs_log_reserve( if (!tic) return -ENOMEM; - tic->t_trans_type = t_type; *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - error = xfs_log_reserve(mp, 600, 1, &tic, - XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); if (!error) { /* the data section must be 32 bit size aligned */ struct { @@ -2032,58 +2029,8 @@ xlog_print_tic_res( REG_TYPE_STR(ICREATE, "inode create") }; #undef REG_TYPE_STR -#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type - static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { - TRANS_TYPE_STR(SETATTR_NOT_SIZE), - TRANS_TYPE_STR(SETATTR_SIZE), - TRANS_TYPE_STR(INACTIVE), - TRANS_TYPE_STR(CREATE), - TRANS_TYPE_STR(CREATE_TRUNC), - TRANS_TYPE_STR(TRUNCATE_FILE), - TRANS_TYPE_STR(REMOVE), - TRANS_TYPE_STR(LINK), - TRANS_TYPE_STR(RENAME), - TRANS_TYPE_STR(MKDIR), - TRANS_TYPE_STR(RMDIR), - TRANS_TYPE_STR(SYMLINK), - TRANS_TYPE_STR(SET_DMATTRS), - TRANS_TYPE_STR(GROWFS), - TRANS_TYPE_STR(STRAT_WRITE), - TRANS_TYPE_STR(DIOSTRAT), - TRANS_TYPE_STR(WRITEID), - TRANS_TYPE_STR(ADDAFORK), - TRANS_TYPE_STR(ATTRINVAL), - TRANS_TYPE_STR(ATRUNCATE), - TRANS_TYPE_STR(ATTR_SET), - TRANS_TYPE_STR(ATTR_RM), - TRANS_TYPE_STR(ATTR_FLAG), - TRANS_TYPE_STR(CLEAR_AGI_BUCKET), - TRANS_TYPE_STR(SB_CHANGE), - TRANS_TYPE_STR(DUMMY1), - TRANS_TYPE_STR(DUMMY2), - TRANS_TYPE_STR(QM_QUOTAOFF), - TRANS_TYPE_STR(QM_DQALLOC), - TRANS_TYPE_STR(QM_SETQLIM), - TRANS_TYPE_STR(QM_DQCLUSTER), - TRANS_TYPE_STR(QM_QINOCREATE), - TRANS_TYPE_STR(QM_QUOTAOFF_END), - TRANS_TYPE_STR(FSYNC_TS), - TRANS_TYPE_STR(GROWFSRT_ALLOC), - TRANS_TYPE_STR(GROWFSRT_ZERO), - TRANS_TYPE_STR(GROWFSRT_FREE), - TRANS_TYPE_STR(SWAPEXT), - TRANS_TYPE_STR(CHECKPOINT), - TRANS_TYPE_STR(ICREATE), - TRANS_TYPE_STR(CREATE_TMPFILE) - }; -#undef TRANS_TYPE_STR xfs_warn(mp, "xlog_write: reservation summary:"); - xfs_warn(mp, " trans type = %s (%u)", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? - "bad-trans-type" : trans_type_str[ticket->t_trans_type]), - ticket->t_trans_type); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); xfs_warn(mp, " current res = %d bytes", @@ -3378,7 +3325,7 @@ xfs_log_force( { int error; - trace_xfs_log_force(mp, 0); + trace_xfs_log_force(mp, 0, _RET_IP_); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3527,7 +3474,7 @@ xfs_log_force_lsn( { int error; - trace_xfs_log_force(mp, lsn); + trace_xfs_log_force(mp, lsn, _RET_IP_); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3709,7 +3656,6 @@ xlog_ticket_alloc( tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; - tic->t_trans_type = 0; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index aa533a7d50f2..80ba0c047090 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -161,8 +161,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int count, struct xlog_ticket **ticket, __uint8_t clientid, - bool permanent, - uint t_type); + bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4e7649351f5a..5e54e7955ea6 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -51,7 +51,6 @@ xlog_cil_ticket_alloc( tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, KM_SLEEP|KM_NOFS); - tic->t_trans_type = XFS_TRANS_CHECKPOINT; /* * set the current reservation to zero so we know to steal the basic diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ed8896310c00..765f084759b5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -175,7 +175,6 @@ typedef struct xlog_ticket { char t_cnt; /* current count : 1 */ char t_clientid; /* who does this belong to; : 1 */ char t_flags; /* properties of reservation : 1 */ - uint t_trans_type; /* transaction type : 4 */ /* reservation array fields */ uint t_res_num; /* num in array : 4 */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 396565f43247..835997843846 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -4205,10 +4205,9 @@ xlog_recover_process_efi( } } - tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto abort_error; + return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { @@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket( int offset; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) - goto out_abort; + goto out_error; error = xfs_read_agi(mp, tp, agno, &agibp); if (error) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index cfd4210dd015..e39b02351b4a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -89,7 +89,6 @@ xfs_uuid_mount( if (hole < 0) { xfs_uuid_table = kmem_realloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - xfs_uuid_table_size * sizeof(*xfs_uuid_table), KM_SLEEP); hole = xfs_uuid_table_size++; } @@ -681,6 +680,9 @@ xfs_mountfs( xfs_set_maxicount(mp); + /* enable fail_at_unmount as default */ + mp->m_fail_unmount = 1; + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) goto out; @@ -690,10 +692,15 @@ xfs_mountfs( if (error) goto out_remove_sysfs; - error = xfs_uuid_mount(mp); + error = xfs_error_sysfs_init(mp); if (error) goto out_del_stats; + + error = xfs_uuid_mount(mp); + if (error) + goto out_remove_error_sysfs; + /* * Set the minimum read and write sizes */ @@ -957,6 +964,7 @@ xfs_mountfs( cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: + mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -968,6 +976,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_remove_error_sysfs: + xfs_error_sysfs_del(mp); out_del_stats: xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: @@ -1006,6 +1016,14 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* + * We now need to tell the world we are unmounting. This will allow + * us to detect that the filesystem is going away and we should error + * out anything that we have been retrying in the background. This will + * prevent neverending retries in AIL pushing from hanging the unmount. + */ + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + /* * Flush all pending changes from the AIL. */ xfs_ail_push_all_sync(mp->m_ail); @@ -1056,6 +1074,7 @@ xfs_unmountfs( #endif xfs_free_perag(mp); + xfs_error_sysfs_del(mp); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index eafe257b357a..c1b798c72126 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -37,6 +37,32 @@ enum { XFS_LOWSP_MAX, }; +/* + * Error Configuration + * + * Error classes define the subsystem the configuration belongs to. + * Error numbers define the errors that are configurable. + */ +enum { + XFS_ERR_METADATA, + XFS_ERR_CLASS_MAX, +}; +enum { + XFS_ERR_DEFAULT, + XFS_ERR_EIO, + XFS_ERR_ENOSPC, + XFS_ERR_ENODEV, + XFS_ERR_ERRNO_MAX, +}; + +#define XFS_ERR_RETRY_FOREVER -1 + +struct xfs_error_cfg { + struct xfs_kobj kobj; + int max_retries; + unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ +}; + typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -127,6 +153,9 @@ typedef struct xfs_mount { int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ struct xfs_kobj m_kobj; + struct xfs_kobj m_error_kobj; + struct xfs_kobj m_error_meta_kobj; + struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; @@ -148,6 +177,7 @@ typedef struct xfs_mount { */ __uint32_t m_generation; + bool m_fail_unmount; #ifdef DEBUG /* * DEBUG mode instrumentation to test and/or trigger delayed allocation @@ -166,6 +196,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -364,4 +395,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb); +struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, + int error_class, int error); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 51ddaf2c2b8c..d5b756669fb5 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -308,12 +308,9 @@ xfs_fs_commit_blocks( goto out_drop_iolock; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_drop_iolock; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be125e1758c1..a60d9e2739d1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -783,13 +783,10 @@ xfs_qm_qino_alloc( } } - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, - XFS_QM_QINOCREATE_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); + if (error) return error; - } if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index f4d0e0a8f517..475a3882a81f 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_IOLOCK_EXCL); - tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -436,12 +434,9 @@ xfs_qm_scall_setqlim( defq = xfs_get_defquota(dqp, q); xfs_dqunlock(dqp); - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); + if (error) goto out_rele; - } xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); @@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end( int error; xfs_qoff_logitem_t *qoffi; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); + if (error) return error; - } qoffi = xfs_trans_get_qoff_item(tp, startqoff, flags & XFS_ALL_QUOTA_ACCT); @@ -603,12 +594,9 @@ xfs_qm_log_quotaoff( *qoffstartp = NULL; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); + if (error) goto out; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index abf44435d04a..3938b37d1043 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,15 +780,14 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* * Reserve space & log for one extent added to the file. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, - resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks, + 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the inode. */ @@ -823,14 +822,13 @@ xfs_growfs_rt_alloc( for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); /* * Reserve log for one block zeroing. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, + 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the bitmap inode. */ @@ -994,11 +992,10 @@ xfs_growfs_rt( /* * Start a transaction, get the log reservation. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0, + &tp); if (error) - goto error_cancel; + break; /* * Lock out other callers by grabbing the bitmap inode lock. */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 187e14b696c2..11ea5d51db56 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -58,8 +58,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -static kmem_zone_t *xfs_ioend_zone; -mempool_t *xfs_ioend_pool; +struct bio_set *xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -350,6 +349,7 @@ xfs_parseargs( case Opt_pqnoenforce: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); mp->m_qflags &= ~XFS_PQUOTA_ENFD; + break; case Opt_gquota: case Opt_grpquota: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | @@ -928,7 +928,7 @@ xfs_fs_alloc_inode( /* * Now that the generic code is guaranteed not to be accessing - * the linux inode, we can reclaim the inode. + * the linux inode, we can inactivate and reclaim the inode. */ STATIC void xfs_fs_destroy_inode( @@ -938,9 +938,14 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - XFS_STATS_INC(ip->i_mount, vn_reclaim); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + XFS_STATS_INC(ip->i_mount, vn_rele); + XFS_STATS_INC(ip->i_mount, vn_remove); + + xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + XFS_STATS_INC(ip->i_mount, vn_reclaim); /* * We should never get here with one of the reclaim flags already set. @@ -987,24 +992,6 @@ xfs_fs_inode_init_once( "xfsino", ip->i_ino); } -STATIC void -xfs_fs_evict_inode( - struct inode *inode) -{ - xfs_inode_t *ip = XFS_I(inode); - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - - trace_xfs_evict_inode(ip); - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - XFS_STATS_INC(ip->i_mount, vn_rele); - XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); -} - /* * We do an unlocked check for XFS_IDONTCACHE here because we are already * serialised against cache hits here via the inode->i_lock and igrab() in @@ -1276,6 +1263,16 @@ xfs_fs_remount( return -EINVAL; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1558,14 +1555,12 @@ xfs_fs_fill_super( if (mp->m_flags & XFS_MOUNT_DAX) { xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - if (sb->s_blocksize != PAGE_SIZE) { - xfs_alert(mp, - "Filesystem block size invalid for DAX Turning DAX off."); - mp->m_flags &= ~XFS_MOUNT_DAX; - } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + + error = bdev_dax_supported(sb, sb->s_blocksize); + if (error) { xfs_alert(mp, - "Block device does not support DAX Turning DAX off."); + "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } } @@ -1663,7 +1658,6 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, - .evict_inode = xfs_fs_evict_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, @@ -1688,20 +1682,15 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); - if (!xfs_ioend_zone) + xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + offsetof(struct xfs_ioend, io_inline_bio)); + if (!xfs_ioend_bioset) goto out; - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, - xfs_ioend_zone); - if (!xfs_ioend_pool) - goto out_destroy_ioend_zone; - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), "xfs_log_ticket"); if (!xfs_log_ticket_zone) - goto out_destroy_ioend_pool; + goto out_free_ioend_bioset; xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), "xfs_bmap_free_item"); @@ -1797,10 +1786,8 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); - out_destroy_ioend_pool: - mempool_destroy(xfs_ioend_pool); - out_destroy_ioend_zone: - kmem_zone_destroy(xfs_ioend_zone); + out_free_ioend_bioset: + bioset_free(xfs_ioend_bioset); out: return -ENOMEM; } @@ -1826,9 +1813,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - mempool_destroy(xfs_ioend_pool); - kmem_zone_destroy(xfs_ioend_zone); - + bioset_free(xfs_ioend_bioset); } STATIC int __init diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index b44284c1adda..08a46c6181fd 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -131,6 +131,8 @@ xfs_readlink( trace_xfs_readlink(ip); + ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE)); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -150,12 +152,7 @@ xfs_readlink( } - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } + error = xfs_readlink_bmap(ip, link); out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -221,7 +218,6 @@ xfs_symlink( if (error) return error; - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -231,13 +227,15 @@ xfs_symlink( else fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); if (error == -ENOSPC && fs_blocks == 0) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -302,19 +300,11 @@ xfs_symlink( * If the symlink will fit into the inode, write it inline. */ if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; + xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); + ip->i_d.di_size = pathlen; ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - } else { int offset; @@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt( */ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 6ced4f143494..4c2c55086208 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -17,10 +17,11 @@ */ #include "xfs.h" -#include "xfs_sysfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_sysfs.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_stats.h" @@ -362,3 +363,291 @@ struct kobj_type xfs_log_ktype = { .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_log_attrs, }; + +/* + * Metadata IO error configuration + * + * The sysfs structure here is: + * ...xfs/<dev>/error/<class>/<errno>/<error_attrs> + * + * where <class> allows us to discriminate between data IO and metadata IO, + * and any other future type of IO (e.g. special inode or directory error + * handling) we care to support. + */ +static inline struct xfs_error_cfg * +to_error_cfg(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_error_cfg, kobj); +} + +static inline struct xfs_mount * +err_to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_mount, m_error_kobj); +} + +static ssize_t +max_retries_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); +} + +static ssize_t +max_retries_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < -1) + return -EINVAL; + + cfg->max_retries = val; + return count; +} +XFS_SYSFS_ATTR_RW(max_retries); + +static ssize_t +retry_timeout_seconds_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%ld\n", + jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); +} + +static ssize_t +retry_timeout_seconds_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + /* 1 day timeout maximum */ + if (val < 0 || val > 86400) + return -EINVAL; + + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + return count; +} +XFS_SYSFS_ATTR_RW(retry_timeout_seconds); + +static ssize_t +fail_at_unmount_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_mount *mp = err_to_mp(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount); +} + +static ssize_t +fail_at_unmount_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = err_to_mp(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 1) + return -EINVAL; + + mp->m_fail_unmount = val; + return count; +} +XFS_SYSFS_ATTR_RW(fail_at_unmount); + +static struct attribute *xfs_error_attrs[] = { + ATTR_LIST(max_retries), + ATTR_LIST(retry_timeout_seconds), + NULL, +}; + + +struct kobj_type xfs_error_cfg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_error_attrs, +}; + +struct kobj_type xfs_error_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, +}; + +/* + * Error initialization tables. These need to be ordered in the same + * order as the enums used to index the array. All class init tables need to + * define a "default" behaviour as the first entry, all other entries can be + * empty. + */ +struct xfs_error_init { + char *name; + int max_retries; + int retry_timeout; /* in seconds */ +}; + +static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { + { .name = "default", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "EIO", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENOSPC", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENODEV", + .max_retries = 0, + }, +}; + +static int +xfs_error_sysfs_init_class( + struct xfs_mount *mp, + int class, + const char *parent_name, + struct xfs_kobj *parent_kobj, + const struct xfs_error_init init[]) +{ + struct xfs_error_cfg *cfg; + int error; + int i; + + ASSERT(class < XFS_ERR_CLASS_MAX); + + error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype, + &mp->m_error_kobj, parent_name); + if (error) + return error; + + for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) { + cfg = &mp->m_error_cfg[class][i]; + error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype, + parent_kobj, init[i].name); + if (error) + goto out_error; + + cfg->max_retries = init[i].max_retries; + cfg->retry_timeout = msecs_to_jiffies( + init[i].retry_timeout * MSEC_PER_SEC); + } + return 0; + +out_error: + /* unwind the entries that succeeded */ + for (i--; i >= 0; i--) { + cfg = &mp->m_error_cfg[class][i]; + xfs_sysfs_del(&cfg->kobj); + } + xfs_sysfs_del(parent_kobj); + return error; +} + +int +xfs_error_sysfs_init( + struct xfs_mount *mp) +{ + int error; + + /* .../xfs/<dev>/error/ */ + error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, + &mp->m_kobj, "error"); + if (error) + return error; + + error = sysfs_create_file(&mp->m_error_kobj.kobject, + ATTR_LIST(fail_at_unmount)); + + if (error) + goto out_error; + + /* .../xfs/<dev>/error/metadata/ */ + error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, + "metadata", &mp->m_error_meta_kobj, + xfs_error_meta_init); + if (error) + goto out_error; + + return 0; + +out_error: + xfs_sysfs_del(&mp->m_error_kobj); + return error; +} + +void +xfs_error_sysfs_del( + struct xfs_mount *mp) +{ + struct xfs_error_cfg *cfg; + int i, j; + + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { + for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { + cfg = &mp->m_error_cfg[i][j]; + + xfs_sysfs_del(&cfg->kobj); + } + } + xfs_sysfs_del(&mp->m_error_meta_kobj); + xfs_sysfs_del(&mp->m_error_kobj); +} + +struct xfs_error_cfg * +xfs_error_get_cfg( + struct xfs_mount *mp, + int error_class, + int error) +{ + struct xfs_error_cfg *cfg; + + switch (error) { + case EIO: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; + break; + case ENOSPC: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC]; + break; + case ENODEV: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV]; + break; + default: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT]; + break; + } + + return cfg; +} diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index be692e59938d..d04637181ef2 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -58,4 +58,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } +int xfs_error_sysfs_init(struct xfs_mount *mp); +void xfs_error_sysfs_del(struct xfs_mount *mp); + #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c8d58426008e..ea94ee0fe5ea 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -364,7 +364,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); @@ -944,7 +943,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) - __field(unsigned, trans_type) __field(char, ocnt) __field(char, cnt) __field(int, curr_res) @@ -962,7 +960,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; - __entry->trans_type = tic->t_trans_type; __entry->ocnt = tic->t_ocnt; __entry->cnt = tic->t_cnt; __entry->curr_res = tic->t_curr_res; @@ -980,14 +977,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_block = log->l_curr_block; __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), - TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u " "t_unit_res %u t_flags %s reserveq %s " "writeq %s grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), __entry->ocnt, __entry->cnt, __entry->curr_res, @@ -1053,19 +1049,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, ) TRACE_EVENT(xfs_log_force, - TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), - TP_ARGS(mp, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip), + TP_ARGS(mp, lsn, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_lsn_t, lsn) + __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->lsn = lsn; + __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d lsn 0x%llx", + TP_printk("dev %d:%d lsn 0x%llx caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->lsn) + __entry->lsn, (void *)__entry->caller_ip) ) #define DEFINE_LOG_ITEM_EVENT(name) \ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 20c53666cb4b..5f3d33d16e67 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -47,47 +47,6 @@ xfs_trans_init( } /* - * This routine is called to allocate a transaction structure. - * The type parameter indicates the type of the transaction. These - * are enumerated in xfs_trans.h. - * - * Dynamically allocate the transaction structure from the transaction - * zone, initialize it, and return it to the caller. - */ -xfs_trans_t * -xfs_trans_alloc( - xfs_mount_t *mp, - uint type) -{ - xfs_trans_t *tp; - - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, type, KM_SLEEP); - tp->t_flags |= XFS_TRANS_FREEZE_PROT; - return tp; -} - -xfs_trans_t * -_xfs_trans_alloc( - xfs_mount_t *mp, - uint type, - xfs_km_flags_t memflags) -{ - xfs_trans_t *tp; - - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); - - tp = kmem_zone_zalloc(xfs_trans_zone, memflags); - tp->t_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_type = type; - tp->t_mountp = mp; - INIT_LIST_HEAD(&tp->t_items); - INIT_LIST_HEAD(&tp->t_busy); - return tp; -} - -/* * Free the transaction structure. If there is more clean up * to do when the structure is freed, add it here. */ @@ -99,7 +58,7 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); - if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); @@ -125,7 +84,6 @@ xfs_trans_dup( * Initialize the new transaction structure. */ ntp->t_magic = XFS_TRANS_HEADER_MAGIC; - ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); @@ -135,9 +93,9 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_FREEZE_PROT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); /* We gave our writer reference to the new transaction */ - tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; + tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; @@ -165,7 +123,7 @@ xfs_trans_dup( * This does not do quota reservations. That typically is done by the * caller afterwards. */ -int +static int xfs_trans_reserve( struct xfs_trans *tp, struct xfs_trans_res *resp, @@ -219,7 +177,7 @@ xfs_trans_reserve( resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, - permanent, tp->t_type); + permanent); } if (error) @@ -268,6 +226,42 @@ undo_blocks: return error; } +int +xfs_trans_alloc( + struct xfs_mount *mp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents, + uint flags, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + int error; + + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) + sb_start_intwrite(mp->m_super); + + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + atomic_inc(&mp->m_active_trans); + + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + tp->t_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_flags = flags; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + + error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + *tpp = tp; + return 0; +} + /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e7c49cf43fbc..9a462e892e4f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -90,7 +90,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, */ typedef struct xfs_trans { unsigned int t_magic; /* magic number */ - unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ unsigned int t_blk_res; /* # of blocks resvd */ @@ -148,10 +147,9 @@ typedef struct xfs_trans { /* * XFS transaction mechanism exported interfaces. */ -xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, - uint, uint); +int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, + uint blocks, uint rtextents, uint flags, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index d111f691f313..ea62245fee26 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -74,11 +74,12 @@ xfs_forget_acl( } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(inode); int error; /* Convert Linux syscall to XFS internal ATTR flags */ @@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); if (!error) - xfs_forget_acl(d_inode(dentry), name, xflags); + xfs_forget_acl(inode, name, xflags); return error; } @@ -146,7 +147,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ - return 1; + return 0; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -166,8 +167,7 @@ xfs_xattr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { char *prefix; int prefix_len; @@ -221,11 +221,15 @@ xfs_xattr_put_listent( } ssize_t -xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +xfs_vn_listxattr( + struct dentry *dentry, + char *data, + size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(dentry); + int error; /* * First read the regular on-disk attributes. @@ -239,7 +243,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - xfs_attr_list_int(&context); + error = xfs_attr_list_int(&context); + if (error) + return error; if (context.count < 0) return -ERANGE; |