diff options
Diffstat (limited to 'fs')
47 files changed, 1834 insertions, 208 deletions
diff --git a/fs/coredump.c b/fs/coredump.c index 9ea87e9fdccf..47c32c3bfa1d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -32,6 +32,9 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/path.h> #include <linux/timekeeping.h> #include <asm/uaccess.h> @@ -649,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo) } } else { struct inode *inode; + int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) goto fail_unlock; @@ -687,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo) * what matters is that at least one of the two processes * writes its coredump successfully, not which one. */ - cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | - O_LARGEFILE | O_EXCL, - 0600); + if (need_suid_safe) { + /* + * Using user namespaces, normal user tasks can change + * their current->fs->root to point to arbitrary + * directories. Since the intention of the "only dump + * with a fully qualified path" rule is to control where + * coredumps may be placed using root privileges, + * current->fs->root must not be used. Instead, use the + * root directory of init_task. + */ + struct path root; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + cprm.file = file_open_root(root.dentry, root.mnt, + cn.corename, open_flags, 0600); + path_put(&root); + } else { + cprm.file = filp_open(cn.corename, open_flags, 0600); + } if (IS_ERR(cprm.file)) goto fail_unlock; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index c5c84dfb5b3e..9893d1538122 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -635,8 +635,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); - rc = -ENOMEM; - goto out; + return -ENOMEM; } (*packet_size) = 0; rc = ecryptfs_find_auth_tok_for_sig( @@ -922,8 +921,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); - rc = -ENOMEM; - goto out; + return -ENOMEM; } if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " diff --git a/fs/eventfd.c b/fs/eventfd.c index ed70cf9fdc7b..1231cd1999d8 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) u64 count; poll_wait(file, &ctx->wqh, wait); - smp_rmb(); - count = ctx->count; + + /* + * All writes to ctx->count occur within ctx->wqh.lock. This read + * can be done outside ctx->wqh.lock because we know that poll_wait + * takes that lock (through add_wait_queue) if our caller will sleep. + * + * The read _can_ therefore seep into add_wait_queue's critical + * section, but cannot move above it! add_wait_queue's spin_lock acts + * as an acquire barrier and ensures that the read be ordered properly + * against the writes. The following CAN happen and is safe: + * + * poll write + * ----------------- ------------ + * lock ctx->wqh.lock (in poll_wait) + * count = ctx->count + * __add_wait_queue + * unlock ctx->wqh.lock + * lock ctx->qwh.lock + * ctx->count += n + * if (waitqueue_active) + * wake_up_locked_poll + * unlock ctx->qwh.lock + * eventfd_poll returns 0 + * + * but the following, which would miss a wakeup, cannot happen: + * + * poll write + * ----------------- ------------ + * count = ctx->count (INVALID!) + * lock ctx->qwh.lock + * ctx->count += n + * **waitqueue_active is false** + * **no wake_up_locked_poll!** + * unlock ctx->qwh.lock + * lock ctx->wqh.lock (in poll_wait) + * __add_wait_queue + * unlock ctx->wqh.lock + * eventfd_poll returns 0 + */ + count = READ_ONCE(ctx->count); if (count > 0) events |= POLLIN; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 33f5e2a50cf8..50ba27cbed03 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -285,7 +285,7 @@ errout: static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT - return is_compat_task(); + return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 182f9ffe2b51..3ff1772f612e 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -93,8 +93,24 @@ config FAT_DEFAULT_IOCHARSET that most of your FAT filesystems use, and can be overridden with the "iocharset" mount option for FAT filesystems. Note that "utf8" is not recommended for FAT filesystems. - If unsure, you shouldn't set "utf8" here. + If unsure, you shouldn't set "utf8" here - select the next option + instead if you would like to use UTF-8 encoded file names by default. See <file:Documentation/filesystems/vfat.txt> for more information. Enable any character sets you need in File Systems/Native Language Support. + +config FAT_DEFAULT_UTF8 + bool "Enable FAT UTF-8 option by default" + depends on VFAT_FS + default n + help + Set this if you would like to have "utf8" mount option set + by default when mounting FAT filesystems. + + Even if you say Y here can always disable UTF-8 for + particular mount by adding "utf8=0" to mount options. + + Say Y if you use UTF-8 encoding for file names, N otherwise. + + See <file:Documentation/filesystems/vfat.txt> for more information. diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a5599052116c..226281068a46 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1127,7 +1127,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, } opts->name_check = 'n'; opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; - opts->utf8 = opts->unicode_xlate = 0; + opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; opts->tz_set = 0; @@ -1135,6 +1135,8 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, opts->errors = FAT_ERRORS_RO; *debug = 0; + opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat; + if (!options) goto out; diff --git a/fs/fhandle.c b/fs/fhandle.c index d59712dfa3e7..ca3c3dd01789 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag); + file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5c46ed9f3e14..fee81e8768c9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode) wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); - wb_put(wb); /* not gonna deref it anymore */ /* i_wb may have changed inbetween, can't use inode_to_wb() */ - if (likely(wb == inode->i_wb)) - return wb; /* @inode already has ref */ + if (likely(wb == inode->i_wb)) { + wb_put(wb); /* @inode already has ref */ + return wb; + } spin_unlock(&wb->list_lock); + wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } @@ -1337,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() * and does more profound writeback list handling in writeback_sb_inodes(). */ -static int -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +static int writeback_single_inode(struct inode *inode, + struct writeback_control *wbc) { + struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); @@ -1378,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); - spin_lock(&wb->list_lock); + + wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If inode is clean, remove it from writeback lists. Otherwise don't @@ -1453,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb, while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); + struct bdi_writeback *tmp_wb; if (inode->i_sb != sb) { if (work->sb) { @@ -1543,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb, cond_resched(); } - - spin_lock(&wb->list_lock); + /* + * Requeue @inode if still dirty. Be careful as @inode may + * have been switched to another wb in the meantime. + */ + tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) wrote++; - requeue_inode(inode, wb, &wbc); + requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); + if (unlikely(tmp_wb != wb)) { + spin_unlock(&tmp_wb->list_lock); + spin_lock(&wb->list_lock); + } + /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. @@ -2338,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, @@ -2350,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - return writeback_single_inode(inode, wb, &wbc); + return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); @@ -2367,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { - return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); + return writeback_single_inode(inode, wbc); } EXPORT_SYMBOL(sync_inode); diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 95d5880a63ee..7e553f286775 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -134,37 +134,59 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) if (mutex_lock_interruptible(&c->alloc_sem)) return -EINTR; + for (;;) { + /* We can't start doing GC until we've finished checking + the node CRCs etc. */ + int bucket, want_ino; + spin_lock(&c->erase_completion_lock); if (!c->unchecked_size) break; - - /* We can't start doing GC yet. We haven't finished checking - the node CRCs etc. Do it now. */ - - /* checked_ino is protected by the alloc_sem */ - if (c->checked_ino > c->highest_ino && xattr) { - pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n", - c->unchecked_size); - jffs2_dbg_dump_block_lists_nolock(c); - spin_unlock(&c->erase_completion_lock); - mutex_unlock(&c->alloc_sem); - return -ENOSPC; - } - spin_unlock(&c->erase_completion_lock); if (!xattr) xattr = jffs2_verify_xattr(c); spin_lock(&c->inocache_lock); + /* Instead of doing the inodes in numeric order, doing a lookup + * in the hash for each possible number, just walk the hash + * buckets of *existing* inodes. This means that we process + * them out-of-order, but it can be a lot faster if there's + * a sparse inode# space. Which there often is. */ + want_ino = c->check_ino; + for (bucket = c->check_ino % c->inocache_hashsize ; bucket < c->inocache_hashsize; bucket++) { + for (ic = c->inocache_list[bucket]; ic; ic = ic->next) { + if (ic->ino < want_ino) + continue; + + if (ic->state != INO_STATE_CHECKEDABSENT && + ic->state != INO_STATE_PRESENT) + goto got_next; /* with inocache_lock held */ + + jffs2_dbg(1, "Skipping ino #%u already checked\n", + ic->ino); + } + want_ino = 0; + } - ic = jffs2_get_ino_cache(c, c->checked_ino++); + /* Point c->check_ino past the end of the last bucket. */ + c->check_ino = ((c->highest_ino + c->inocache_hashsize + 1) & + ~c->inocache_hashsize) - 1; - if (!ic) { - spin_unlock(&c->inocache_lock); - continue; - } + spin_unlock(&c->inocache_lock); + + pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n", + c->unchecked_size); + jffs2_dbg_dump_block_lists_nolock(c); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + + got_next: + /* For next time round the loop, we want c->checked_ino to indicate + * the *next* one we want to check. And since we're walking the + * buckets rather than doing it sequentially, it's: */ + c->check_ino = ic->ino + c->inocache_hashsize; if (!ic->pino_nlink) { jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n", @@ -176,8 +198,6 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) switch(ic->state) { case INO_STATE_CHECKEDABSENT: case INO_STATE_PRESENT: - jffs2_dbg(1, "Skipping ino #%u already checked\n", - ic->ino); spin_unlock(&c->inocache_lock); continue; @@ -196,7 +216,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) ic->ino); /* We need to come back again for the _same_ inode. We've made no progress in this case, but that should be OK */ - c->checked_ino--; + c->check_ino = ic->ino; mutex_unlock(&c->alloc_sem); sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 046fee8b6e9b..778275f48a87 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -49,7 +49,7 @@ struct jffs2_sb_info { struct mtd_info *mtd; uint32_t highest_ino; - uint32_t checked_ino; + uint32_t check_ino; /* *NEXT* inode to be checked */ unsigned int flags; diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index b6bd4affd9ad..cda0774c2c9c 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -846,8 +846,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c) return 1; if (c->unchecked_size) { - jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", - c->unchecked_size, c->checked_ino); + jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, check_ino #%d\n", + c->unchecked_size, c->check_ino); return 1; } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 5a3da3f52908..b25d28a21212 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1183,22 +1183,20 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c) int jffs2_nand_flash_setup(struct jffs2_sb_info *c) { - struct nand_ecclayout *oinfo = c->mtd->ecclayout; - if (!c->mtd->oobsize) return 0; /* Cleanmarker is out-of-band, so inline size zero */ c->cleanmarker_size = 0; - if (!oinfo || oinfo->oobavail == 0) { + if (c->mtd->oobavail == 0) { pr_err("inconsistent device description\n"); return -EINVAL; } jffs2_dbg(1, "using OOB on NAND\n"); - c->oobavail = oinfo->oobavail; + c->oobavail = c->mtd->oobavail; /* Initialise write buffer */ init_rwsem(&c->wbuf_sem); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 8bc870e4c467..02e4d87d2ed3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) kfree(bl); } -static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, - gfp_t gfp_flags) +static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags, bool is_scsi_layout) { struct pnfs_block_layout *bl; @@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, bl->bl_ext_ro = RB_ROOT; spin_lock_init(&bl->bl_ext_lock); + bl->bl_scsi_layout = is_scsi_layout; return &bl->bl_layout; } +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags) +{ + return __bl_alloc_layout_hdr(inode, gfp_flags, false); +} + +static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags) +{ + return __bl_alloc_layout_hdr(inode, gfp_flags, true); +} + static void bl_free_lseg(struct pnfs_layout_segment *lseg) { dprintk("%s enter\n", __func__); @@ -889,22 +902,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .sync = pnfs_generic_sync, }; +static struct pnfs_layoutdriver_type scsilayout_type = { + .id = LAYOUT_SCSI, + .name = "LAYOUT_SCSI", + .owner = THIS_MODULE, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_READ_WHOLE_PAGE, + .read_pagelist = bl_read_pagelist, + .write_pagelist = bl_write_pagelist, + .alloc_layout_hdr = sl_alloc_layout_hdr, + .free_layout_hdr = bl_free_layout_hdr, + .alloc_lseg = bl_alloc_lseg, + .free_lseg = bl_free_lseg, + .return_range = bl_return_range, + .prepare_layoutcommit = bl_prepare_layoutcommit, + .cleanup_layoutcommit = bl_cleanup_layoutcommit, + .set_layoutdriver = bl_set_layoutdriver, + .alloc_deviceid_node = bl_alloc_deviceid_node, + .free_deviceid_node = bl_free_deviceid_node, + .pg_read_ops = &bl_pg_read_ops, + .pg_write_ops = &bl_pg_write_ops, + .sync = pnfs_generic_sync, +}; + + static int __init nfs4blocklayout_init(void) { int ret; dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); - ret = pnfs_register_layoutdriver(&blocklayout_type); + ret = bl_init_pipefs(); if (ret) goto out; - ret = bl_init_pipefs(); + + ret = pnfs_register_layoutdriver(&blocklayout_type); if (ret) - goto out_unregister; + goto out_cleanup_pipe; + + ret = pnfs_register_layoutdriver(&scsilayout_type); + if (ret) + goto out_unregister_block; return 0; -out_unregister: +out_unregister_block: pnfs_unregister_layoutdriver(&blocklayout_type); +out_cleanup_pipe: + bl_cleanup_pipefs(); out: return ret; } @@ -914,8 +958,9 @@ static void __exit nfs4blocklayout_exit(void) dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", __func__); - bl_cleanup_pipefs(); + pnfs_unregister_layoutdriver(&scsilayout_type); pnfs_unregister_layoutdriver(&blocklayout_type); + bl_cleanup_pipefs(); } MODULE_ALIAS("nfs-layouttype4-3"); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index c556640dcf3b..bc21205309e0 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -55,7 +55,6 @@ struct pnfs_block_dev; */ #define PNFS_BLOCK_UUID_LEN 128 - struct pnfs_block_volume { enum pnfs_block_volume_type type; union { @@ -82,6 +81,13 @@ struct pnfs_block_volume { u32 volumes_count; u32 volumes[PNFS_BLOCK_MAX_DEVICES]; } stripe; + struct { + enum scsi_code_set code_set; + enum scsi_designator_type designator_type; + int designator_len; + u8 designator[256]; + u64 pr_key; + } scsi; }; }; @@ -106,6 +112,9 @@ struct pnfs_block_dev { struct block_device *bdev; u64 disk_offset; + u64 pr_key; + bool pr_registered; + bool (*map)(struct pnfs_block_dev *dev, u64 offset, struct pnfs_block_dev_map *map); }; @@ -131,6 +140,7 @@ struct pnfs_block_layout { struct rb_root bl_ext_rw; struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ + bool bl_scsi_layout; }; static inline struct pnfs_block_layout * @@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status); dev_t bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, gfp_t gfp_mask); int __init bl_init_pipefs(void); -void __exit bl_cleanup_pipefs(void); +void bl_cleanup_pipefs(void); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index a861bbdfe577..e5b89675263e 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -1,11 +1,12 @@ /* - * Copyright (c) 2014 Christoph Hellwig. + * Copyright (c) 2014-2016 Christoph Hellwig. */ #include <linux/sunrpc/svc.h> #include <linux/blkdev.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_xdr.h> +#include <linux/pr.h> #include "blocklayout.h" @@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev) bl_free_device(&dev->children[i]); kfree(dev->children); } else { + if (dev->pr_registered) { + const struct pr_ops *ops = + dev->bdev->bd_disk->fops->pr_ops; + int error; + + error = ops->pr_register(dev->bdev, dev->pr_key, 0, + false); + if (error) + pr_err("failed to unregister PR key.\n"); + } + if (dev->bdev) blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); } @@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) for (i = 0; i < b->stripe.volumes_count; i++) b->stripe.volumes[i] = be32_to_cpup(p++); break; + case PNFS_BLOCK_VOLUME_SCSI: + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (!p) + return -EIO; + b->scsi.code_set = be32_to_cpup(p++); + b->scsi.designator_type = be32_to_cpup(p++); + b->scsi.designator_len = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, b->scsi.designator_len); + if (!p) + return -EIO; + if (b->scsi.designator_len > 256) + return -EIO; + memcpy(&b->scsi.designator, p, b->scsi.designator_len); + p = xdr_inline_decode(xdr, 8); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->scsi.pr_key); + break; default: dprintk("unknown volume type!\n"); return -EIO; @@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, return 0; } +static bool +bl_validate_designator(struct pnfs_block_volume *v) +{ + switch (v->scsi.designator_type) { + case PS_DESIGNATOR_EUI64: + if (v->scsi.code_set != PS_CODE_SET_BINARY) + return false; + + if (v->scsi.designator_len != 8 && + v->scsi.designator_len != 10 && + v->scsi.designator_len != 16) + return false; + + return true; + case PS_DESIGNATOR_NAA: + if (v->scsi.code_set != PS_CODE_SET_BINARY) + return false; + + if (v->scsi.designator_len != 8 && + v->scsi.designator_len != 16) + return false; + + return true; + case PS_DESIGNATOR_T10: + case PS_DESIGNATOR_NAME: + pr_err("pNFS: unsupported designator " + "(code set %d, type %d, len %d.\n", + v->scsi.code_set, + v->scsi.designator_type, + v->scsi.designator_len); + return false; + default: + pr_err("pNFS: invalid designator " + "(code set %d, type %d, len %d.\n", + v->scsi.code_set, + v->scsi.designator_type, + v->scsi.designator_len); + return false; + } +} + +static int +bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + const struct pr_ops *ops; + const char *devname; + int error; + + if (!bl_validate_designator(v)) + return -EINVAL; + + switch (v->scsi.designator_len) { + case 8: + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", + v->scsi.designator); + break; + case 12: + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN", + v->scsi.designator); + break; + case 16: + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN", + v->scsi.designator); + break; + default: + return -EINVAL; + } + + d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL); + if (IS_ERR(d->bdev)) { + pr_warn("pNFS: failed to open device %s (%ld)\n", + devname, PTR_ERR(d->bdev)); + kfree(devname); + return PTR_ERR(d->bdev); + } + + kfree(devname); + + d->len = i_size_read(d->bdev->bd_inode); + d->map = bl_map_simple; + d->pr_key = v->scsi.pr_key; + + pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", + d->bdev->bd_disk->disk_name, d->pr_key); + + ops = d->bdev->bd_disk->fops->pr_ops; + if (!ops) { + pr_err("pNFS: block device %s does not support reservations.", + d->bdev->bd_disk->disk_name); + error = -EINVAL; + goto out_blkdev_put; + } + + error = ops->pr_register(d->bdev, 0, d->pr_key, true); + if (error) { + pr_err("pNFS: failed to register key for block device %s.", + d->bdev->bd_disk->disk_name); + goto out_blkdev_put; + } + + d->pr_registered = true; + return 0; + +out_blkdev_put: + blkdev_put(d->bdev, FMODE_READ); + return error; +} + static int bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) @@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, return bl_parse_concat(server, d, volumes, idx, gfp_mask); case PNFS_BLOCK_VOLUME_STRIPE: return bl_parse_stripe(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_SCSI: + return bl_parse_scsi(server, d, volumes, idx, gfp_mask); default: dprintk("unsupported volume type: %d\n", volumes[idx].type); return -EIO; diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 35ab51c04814..720b3ff55fa9 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Christoph Hellwig. + * Copyright (c) 2014-2016 Christoph Hellwig. */ #include <linux/vmalloc.h> @@ -462,10 +462,12 @@ out: return err; } -static size_t ext_tree_layoutupdate_size(size_t count) +static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count) { - return sizeof(__be32) /* number of entries */ + - PNFS_BLOCK_EXTENT_SIZE * count; + if (bl->bl_scsi_layout) + return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count; + else + return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count; } static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, @@ -483,6 +485,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, } } +static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p) +{ + p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, + NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + return p; +} + +static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) +{ + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); + return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); +} + static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, size_t buffer_size, size_t *count) { @@ -496,19 +515,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, continue; (*count)++; - if (ext_tree_layoutupdate_size(*count) > buffer_size) { + if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { /* keep counting.. */ ret = -ENOSPC; continue; } - p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, - NFS4_DEVICEID4_SIZE); - p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); - p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); - p = xdr_encode_hyper(p, 0LL); - *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); - + if (bl->bl_scsi_layout) + p = encode_scsi_range(be, p); + else + p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; } spin_unlock(&bl->bl_ext_lock); @@ -537,7 +553,7 @@ retry: if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = ext_tree_layoutupdate_size(count); + buffer_size = ext_tree_layoutupdate_size(bl, count); count = 0; arg->layoutupdate_pages = @@ -556,7 +572,7 @@ retry: } *start_p = cpu_to_be32(count); - arg->layoutupdate_len = ext_tree_layoutupdate_size(count); + arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count); if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { void *p = start_p, *end = p + arg->layoutupdate_len; diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index dbe5839cdeba..9fb067a6f7e0 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -281,7 +281,7 @@ out: return ret; } -void __exit bl_cleanup_pipefs(void) +void bl_cleanup_pipefs(void) { rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); unregister_pernet_subsys(&nfs4blocklayout_net_ops); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index a0b77fc1bd39..c9f583d7bac8 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -84,12 +84,30 @@ config NFSD_V4 If unsure, say N. config NFSD_PNFS - bool "NFSv4.1 server support for Parallel NFS (pNFS)" - depends on NFSD_V4 + bool + +config NFSD_BLOCKLAYOUT + bool "NFSv4.1 server support for pNFS block layouts" + depends on NFSD_V4 && BLOCK + select NFSD_PNFS + help + This option enables support for the exporting pNFS block layouts + in the kernel's NFS server. The pNFS block layout enables NFS + clients to directly perform I/O to block devices accesible to both + the server and the clients. See RFC 5663 for more details. + + If unsure, say N. + +config NFSD_SCSILAYOUT + bool "NFSv4.1 server support for pNFS SCSI layouts" + depends on NFSD_V4 && BLOCK + select NFSD_PNFS help - This option enables support for the parallel NFS features of the - minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS - server. + This option enables support for the exporting pNFS SCSI layouts + in the kernel's NFS server. The pNFS SCSI layout enables NFS + clients to directly perform I/O to SCSI devices accesible to both + the server and the clients. See draft-ietf-nfsv4-scsi-layout for + more details. If unsure, say N. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 9a6028e120c6..3ae5f3c77e28 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -17,4 +17,6 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfs4acl.o nfs4callback.o nfs4recover.o -nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o +nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o +nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index c29d9421bd5e..e55b5242614d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -1,11 +1,14 @@ /* - * Copyright (c) 2014 Christoph Hellwig. + * Copyright (c) 2014-2016 Christoph Hellwig. */ #include <linux/exportfs.h> #include <linux/genhd.h> #include <linux/slab.h> +#include <linux/pr.h> #include <linux/nfsd/debug.h> +#include <scsi/scsi_proto.h> +#include <scsi/scsi_common.h> #include "blocklayoutxdr.h" #include "pnfs.h" @@ -13,37 +16,6 @@ #define NFSDDBG_FACILITY NFSDDBG_PNFS -static int -nfsd4_block_get_device_info_simple(struct super_block *sb, - struct nfsd4_getdeviceinfo *gdp) -{ - struct pnfs_block_deviceaddr *dev; - struct pnfs_block_volume *b; - - dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + - sizeof(struct pnfs_block_volume), GFP_KERNEL); - if (!dev) - return -ENOMEM; - gdp->gd_device = dev; - - dev->nr_volumes = 1; - b = &dev->volumes[0]; - - b->type = PNFS_BLOCK_VOLUME_SIMPLE; - b->simple.sig_len = PNFS_BLOCK_UUID_LEN; - return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len, - &b->simple.offset); -} - -static __be32 -nfsd4_block_proc_getdeviceinfo(struct super_block *sb, - struct nfsd4_getdeviceinfo *gdp) -{ - if (sb->s_bdev != sb->s_bdev->bd_contains) - return nfserr_inval; - return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); -} - static __be32 nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, struct nfsd4_layoutget *args) @@ -141,20 +113,13 @@ out_layoutunavailable: } static __be32 -nfsd4_block_proc_layoutcommit(struct inode *inode, - struct nfsd4_layoutcommit *lcp) +nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, + struct iomap *iomaps, int nr_iomaps) { loff_t new_size = lcp->lc_last_wr + 1; struct iattr iattr = { .ia_valid = 0 }; - struct iomap *iomaps; - int nr_iomaps; int error; - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); - if (nr_iomaps < 0) - return nfserrno(nr_iomaps); - if (lcp->lc_mtime.tv_nsec == UTIME_NOW || timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) lcp->lc_mtime = current_fs_time(inode->i_sb); @@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, return nfserrno(error); } +#ifdef CONFIG_NFSD_BLOCKLAYOUT +static int +nfsd4_block_get_device_info_simple(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev; + struct pnfs_block_volume *b; + + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + + sizeof(struct pnfs_block_volume), GFP_KERNEL); + if (!dev) + return -ENOMEM; + gdp->gd_device = dev; + + dev->nr_volumes = 1; + b = &dev->volumes[0]; + + b->type = PNFS_BLOCK_VOLUME_SIMPLE; + b->simple.sig_len = PNFS_BLOCK_UUID_LEN; + return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len, + &b->simple.offset); +} + +static __be32 +nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct nfs4_client *clp, + struct nfsd4_getdeviceinfo *gdp) +{ + if (sb->s_bdev != sb->s_bdev->bd_contains) + return nfserr_inval; + return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); +} + +static __be32 +nfsd4_block_proc_layoutcommit(struct inode *inode, + struct nfsd4_layoutcommit *lcp) +{ + struct iomap *iomaps; + int nr_iomaps; + + nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + if (nr_iomaps < 0) + return nfserrno(nr_iomaps); + + return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); +} + const struct nfsd4_layout_ops bl_layout_ops = { /* * Pretend that we send notification to the client. This is a blatant @@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = { .encode_layoutget = nfsd4_block_encode_layoutget, .proc_layoutcommit = nfsd4_block_proc_layoutcommit, }; +#endif /* CONFIG_NFSD_BLOCKLAYOUT */ + +#ifdef CONFIG_NFSD_SCSILAYOUT +static int nfsd4_scsi_identify_device(struct block_device *bdev, + struct pnfs_block_volume *b) +{ + struct request_queue *q = bdev->bd_disk->queue; + struct request *rq; + size_t bufflen = 252, len, id_len; + u8 *buf, *d, type, assoc; + int error; + + buf = kzalloc(bufflen, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + rq = blk_get_request(q, READ, GFP_KERNEL); + if (IS_ERR(rq)) { + error = -ENOMEM; + goto out_free_buf; + } + blk_rq_set_block_pc(rq); + + error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); + if (error) + goto out_put_request; + + rq->cmd[0] = INQUIRY; + rq->cmd[1] = 1; + rq->cmd[2] = 0x83; + rq->cmd[3] = bufflen >> 8; + rq->cmd[4] = bufflen & 0xff; + rq->cmd_len = COMMAND_SIZE(INQUIRY); + + error = blk_execute_rq(rq->q, NULL, rq, 1); + if (error) { + pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", + rq->errors); + goto out_put_request; + } + + len = (buf[2] << 8) + buf[3] + 4; + if (len > bufflen) { + pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n", + len); + goto out_put_request; + } + + d = buf + 4; + for (d = buf + 4; d < buf + len; d += id_len + 4) { + id_len = d[3]; + type = d[1] & 0xf; + assoc = (d[1] >> 4) & 0x3; + + /* + * We only care about a EUI-64 and NAA designator types + * with LU association. + */ + if (assoc != 0x00) + continue; + if (type != 0x02 && type != 0x03) + continue; + if (id_len != 8 && id_len != 12 && id_len != 16) + continue; + + b->scsi.code_set = PS_CODE_SET_BINARY; + b->scsi.designator_type = type == 0x02 ? + PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA; + b->scsi.designator_len = id_len; + memcpy(b->scsi.designator, d + 4, id_len); + + /* + * If we found a 8 or 12 byte descriptor continue on to + * see if a 16 byte one is available. If we find a + * 16 byte descriptor we're done. + */ + if (id_len == 16) + break; + } + +out_put_request: + blk_put_request(rq); +out_free_buf: + kfree(buf); + return error; +} + +#define NFSD_MDS_PR_KEY 0x0100000000000000 + +/* + * We use the client ID as a unique key for the reservations. + * This allows us to easily fence a client when recalls fail. + */ +static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp) +{ + return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id; +} + +static int +nfsd4_block_get_device_info_scsi(struct super_block *sb, + struct nfs4_client *clp, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev; + struct pnfs_block_volume *b; + const struct pr_ops *ops; + int error; + + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + + sizeof(struct pnfs_block_volume), GFP_KERNEL); + if (!dev) + return -ENOMEM; + gdp->gd_device = dev; + + dev->nr_volumes = 1; + b = &dev->volumes[0]; + + b->type = PNFS_BLOCK_VOLUME_SCSI; + b->scsi.pr_key = nfsd4_scsi_pr_key(clp); + + error = nfsd4_scsi_identify_device(sb->s_bdev, b); + if (error) + return error; + + ops = sb->s_bdev->bd_disk->fops->pr_ops; + if (!ops) { + pr_err("pNFS: device %s does not support PRs.\n", + sb->s_id); + return -EINVAL; + } + + error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); + if (error) { + pr_err("pNFS: failed to register key for device %s.\n", + sb->s_id); + return -EINVAL; + } + + error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, + PR_EXCLUSIVE_ACCESS_REG_ONLY, 0); + if (error) { + pr_err("pNFS: failed to reserve device %s.\n", + sb->s_id); + return -EINVAL; + } + + return 0; +} + +static __be32 +nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, + struct nfs4_client *clp, + struct nfsd4_getdeviceinfo *gdp) +{ + if (sb->s_bdev != sb->s_bdev->bd_contains) + return nfserr_inval; + return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); +} +static __be32 +nfsd4_scsi_proc_layoutcommit(struct inode *inode, + struct nfsd4_layoutcommit *lcp) +{ + struct iomap *iomaps; + int nr_iomaps; + + nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + if (nr_iomaps < 0) + return nfserrno(nr_iomaps); + + return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); +} + +static void +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) +{ + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev; + + bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, + nfsd4_scsi_pr_key(clp), 0, true); +} + +const struct nfsd4_layout_ops scsi_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .proc_getdeviceinfo = nfsd4_scsi_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, + .proc_layoutget = nfsd4_block_proc_layoutget, + .encode_layoutget = nfsd4_block_encode_layoutget, + .proc_layoutcommit = nfsd4_scsi_proc_layoutcommit, + .fence_client = nfsd4_scsi_fence_client, +}; +#endif /* CONFIG_NFSD_SCSILAYOUT */ diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 6d834dc9bbc8..6c3b316f932e 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Christoph Hellwig. + * Copyright (c) 2014-2016 Christoph Hellwig. */ #include <linux/sunrpc/svc.h> #include <linux/exportfs.h> @@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_encode_hyper(p, b->simple.offset); p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); break; + case PNFS_BLOCK_VOLUME_SCSI: + len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8; + p = xdr_reserve_space(xdr, len); + if (!p) + return -ETOOSMALL; + + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(b->scsi.code_set); + *p++ = cpu_to_be32(b->scsi.designator_type); + p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len); + p = xdr_encode_hyper(p, b->scsi.pr_key); + break; default: return -ENOTSUPP; } @@ -93,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, u32 block_size) { struct iomap *iomaps; - u32 nr_iomaps, expected, i; + u32 nr_iomaps, i; if (len < sizeof(u32)) { dprintk("%s: extent array too small: %u\n", __func__, len); return -EINVAL; } + len -= sizeof(u32); + if (len % PNFS_BLOCK_EXTENT_SIZE) { + dprintk("%s: extent array invalid: %u\n", __func__, len); + return -EINVAL; + } nr_iomaps = be32_to_cpup(p++); - expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; - if (len != expected) { + if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { dprintk("%s: extent array size mismatch: %u/%u\n", - __func__, len, expected); + __func__, len, nr_iomaps); return -EINVAL; } @@ -155,3 +171,54 @@ fail: kfree(iomaps); return -EINVAL; } + +int +nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size) +{ + struct iomap *iomaps; + u32 nr_iomaps, expected, i; + + if (len < sizeof(u32)) { + dprintk("%s: extent array too small: %u\n", __func__, len); + return -EINVAL; + } + + nr_iomaps = be32_to_cpup(p++); + expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; + if (len != expected) { + dprintk("%s: extent array size mismatch: %u/%u\n", + __func__, len, expected); + return -EINVAL; + } + + iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); + if (!iomaps) { + dprintk("%s: failed to allocate extent array\n", __func__); + return -ENOMEM; + } + + for (i = 0; i < nr_iomaps; i++) { + u64 val; + + p = xdr_decode_hyper(p, &val); + if (val & (block_size - 1)) { + dprintk("%s: unaligned offset 0x%llx\n", __func__, val); + goto fail; + } + iomaps[i].offset = val; + + p = xdr_decode_hyper(p, &val); + if (val & (block_size - 1)) { + dprintk("%s: unaligned length 0x%llx\n", __func__, val); + goto fail; + } + iomaps[i].length = val; + } + + *iomapp = iomaps; + return nr_iomaps; +fail: + kfree(iomaps); + return -EINVAL; +} diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index 6de925fe8499..397bc7563a49 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -15,6 +15,11 @@ struct pnfs_block_extent { enum pnfs_block_extent_state es; }; +struct pnfs_block_range { + u64 foff; + u64 len; +}; + /* * Random upper cap for the uuid length to avoid unbounded allocation. * Not actually limited by the protocol. @@ -29,6 +34,13 @@ struct pnfs_block_volume { u32 sig_len; u8 sig[PNFS_BLOCK_UUID_LEN]; } simple; + struct { + enum scsi_code_set code_set; + enum scsi_designator_type designator_type; + int designator_len; + u8 designator[256]; + u64 pr_key; + } scsi; }; }; @@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, struct nfsd4_layoutget *lgp); int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, u32 block_size); +int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size); #endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 7b755b7f785c..51c3b06e8036 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -147,6 +147,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, { __be32 nfserr; u32 max_blocksize = svc_max_payload(rqstp); + unsigned long cnt = min(argp->count, max_blocksize); dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), @@ -157,7 +158,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - resp->count = min(argp->count, max_blocksize); + resp->count = cnt; svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); @@ -167,8 +168,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, &resp->count); if (nfserr == 0) { struct inode *inode = d_inode(resp->fh.fh_dentry); - - resp->eof = (argp->offset + resp->count) >= inode->i_size; + resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset, + inode->i_size); } RETURN_STATUS(nfserr); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ce2d010d3b17..825c7bc8d789 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2014 Christoph Hellwig. */ +#include <linux/blkdev.h> #include <linux/kmod.h> #include <linux/file.h> #include <linux/jhash.h> @@ -26,7 +27,12 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { +#ifdef CONFIG_NFSD_BLOCKLAYOUT [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, +#endif +#ifdef CONFIG_NFSD_SCSILAYOUT + [LAYOUT_SCSI] = &scsi_layout_ops, +#endif }; /* pNFS device ID to export fsid mapping */ @@ -121,10 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp) if (!(exp->ex_flags & NFSEXP_PNFS)) return; + /* + * Check if the file system supports exporting a block-like layout. + * If the block device supports reservations prefer the SCSI layout, + * otherwise advertise the block layout. + */ +#ifdef CONFIG_NFSD_BLOCKLAYOUT if (sb->s_export_op->get_uuid && sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks) exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; +#endif +#ifdef CONFIG_NFSD_SCSILAYOUT + /* overwrite block layout selection if needed */ + if (sb->s_export_op->map_blocks && + sb->s_export_op->commit_blocks && + sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops) + exp->ex_layout_type = LAYOUT_SCSI; +#endif } static void @@ -590,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); - trace_layout_recall_fail(&ls->ls_stid.sc_stateid); - printk(KERN_WARNING "nfsd: client %s failed to respond to layout recall. " " Fencing..\n", addr_str); @@ -626,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) container_of(cb, struct nfs4_layout_stateid, ls_recall); struct nfsd_net *nn; ktime_t now, cutoff; + const struct nfsd4_layout_ops *ops; LIST_HEAD(reaplist); @@ -661,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) /* * Unknown error or non-responding client, we'll need to fence. */ - nfsd4_cb_layout_fail(ls); + trace_layout_recall_fail(&ls->ls_stid.sc_stateid); + + ops = nfsd4_layout_ops[ls->ls_layout_type]; + if (ops->fence_client) + ops->fence_client(ls); + else + nfsd4_cb_layout_fail(ls); return -1; } } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4cba7865f496..de1ff1d98bb1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -864,12 +864,10 @@ static __be32 nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_secinfo *secinfo) { - struct svc_fh resfh; struct svc_export *exp; struct dentry *dentry; __be32 err; - fh_init(&resfh, NFS4_FHSIZE); err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); if (err) return err; @@ -878,6 +876,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &exp, &dentry); if (err) return err; + fh_unlock(&cstate->current_fh); if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; @@ -1269,8 +1268,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, goto out; nfserr = nfs_ok; - if (gdp->gd_maxcount != 0) - nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); + if (gdp->gd_maxcount != 0) { + nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, + cstate->session->se_client, gdp); + } gdp->gd_notify_types &= ops->notify_types; out: diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 195fe2668207..66eaeb1e8c2c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1266,6 +1266,7 @@ nfsd4_umh_cltrack_init(struct net *net) /* XXX: The usermode helper s not working in container yet. */ if (net != &init_net) { pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); + kfree(grace_start); return -EINVAL; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c484a2b6cd10..0462eeddfff9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2408,7 +2408,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, default: /* checked by xdr code */ WARN_ON_ONCE(1); case SP4_SSV: - return nfserr_encr_alg_unsupp; + status = nfserr_encr_alg_unsupp; + goto out_nolock; } /* Cases below refer to rfc 5661 section 18.35.4: */ @@ -2586,21 +2587,26 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs return nfs_ok; } +/* + * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now. + * These are based on similar macros in linux/sunrpc/msg_prot.h . + */ +#define RPC_MAX_HEADER_WITH_AUTH_SYS \ + (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK)) + +#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \ + (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK)) + #define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ - RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) + RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32)) #define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ - RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) + RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \ + sizeof(__be32)) static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) { ca->headerpadsz = 0; - /* - * These RPC_MAX_HEADER macros are overkill, especially since we - * don't even do gss on the backchannel yet. But this is still - * less than 1k. Tighten up this estimate in the unlikely event - * it turns out to be a problem for some client: - */ if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) return nfserr_toosmall; if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) @@ -2710,10 +2716,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } status = nfs_ok; - /* - * We do not support RDMA or persistent sessions - */ + /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; + /* Upshifting from TCP to RDMA is not supported */ cr_ses->flags &= ~SESSION4_RDMA; init_session(rqstp, new, conf, cr_ses); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d6ef0955a979..9df898ba648f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename READ_BUF(4); rename->rn_snamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_snamelen + 4); + READ_BUF(rename->rn_snamelen); SAVEMEM(rename->rn_sname, rename->rn_snamelen); + READ_BUF(4); rename->rn_tnamelen = be32_to_cpup(p++); READ_BUF(rename->rn_tnamelen); SAVEMEM(rename->rn_tname, rename->rn_tnamelen); @@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient READ_BUF(8); setclientid->se_callback_prog = be32_to_cpup(p++); setclientid->se_callback_netid_len = be32_to_cpup(p++); - - READ_BUF(setclientid->se_callback_netid_len + 4); + READ_BUF(setclientid->se_callback_netid_len); SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); + READ_BUF(4); setclientid->se_callback_addr_len = be32_to_cpup(p++); - READ_BUF(setclientid->se_callback_addr_len + 4); + READ_BUF(setclientid->se_callback_addr_len); SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); + READ_BUF(4); setclientid->se_callback_ident = be32_to_cpup(p++); DECODE_TAIL; @@ -1835,8 +1837,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) READ_BUF(4); argp->taglen = be32_to_cpup(p++); - READ_BUF(argp->taglen + 8); + READ_BUF(argp->taglen); SAVEMEM(argp->tag, argp->taglen); + READ_BUF(8); argp->minorversion = be32_to_cpup(p++); argp->opcnt = be32_to_cpup(p++); max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); @@ -3060,7 +3063,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(bcts->dir); - /* Sorry, we do not yet support RDMA over 4.1: */ + /* Upshifting from TCP to RDMA is not supported */ *p++ = cpu_to_be32(0); } return nfserr; @@ -3362,6 +3365,7 @@ static __be32 nfsd4_encode_splice_read( struct xdr_stream *xdr = &resp->xdr; struct xdr_buf *buf = xdr->buf; u32 eof; + long len; int space_left; __be32 nfserr; __be32 *p = xdr->p - 2; @@ -3370,6 +3374,7 @@ static __be32 nfsd4_encode_splice_read( if (xdr->end - xdr->p < 1) return nfserr_resource; + len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, file, read->rd_offset, &maxcount); if (nfserr) { @@ -3382,8 +3387,8 @@ static __be32 nfsd4_encode_splice_read( return nfserr; } - eof = (read->rd_offset + maxcount >= - d_inode(read->rd_fhp->fh_dentry)->i_size); + eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, + d_inode(read->rd_fhp->fh_dentry)->i_size); *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3453,14 +3458,15 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, } read->rd_vlen = v; + len = maxcount; nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); - eof = (read->rd_offset + maxcount >= - d_inode(read->rd_fhp->fh_dentry)->i_size); + eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, + d_inode(read->rd_fhp->fh_dentry)->i_size); tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index d4c4453674c6..7d073b9b1553 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -21,6 +21,7 @@ struct nfsd4_layout_ops { u32 notify_types; __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdevp); __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, struct nfsd4_getdeviceinfo *gdevp); @@ -32,10 +33,17 @@ struct nfsd4_layout_ops { __be32 (*proc_layoutcommit)(struct inode *inode, struct nfsd4_layoutcommit *lcp); + + void (*fence_client)(struct nfs4_layout_stateid *ls); }; extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; +#ifdef CONFIG_NFSD_BLOCKLAYOUT extern const struct nfsd4_layout_ops bl_layout_ops; +#endif +#ifdef CONFIG_NFSD_SCSILAYOUT +extern const struct nfsd4_layout_ops scsi_layout_ops; +#endif __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index c11ba316f23f..2d573ec057f8 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -139,4 +139,23 @@ static inline int nfsd_create_is_exclusive(int createmode) || createmode == NFS4_CREATE_EXCLUSIVE4_1; } +static inline bool nfsd_eof_on_read(long requested, long read, + loff_t offset, loff_t size) +{ + /* We assume a short read means eof: */ + if (requested > read) + return true; + /* + * A non-short read might also reach end of file. The spec + * still requires us to set eof in that case. + * + * Further operations may have modified the file size since + * the read, so the following check is not atomic with the read. + * We've only seen that cause a problem for a client in the case + * where the read returned a count of 0 without setting eof. + * That case was fixed by the addition of the above check. + */ + return (offset + read >= size); +} + #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index ce210d4951a1..e27e6527912b 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -41,7 +41,8 @@ ocfs2-objs := \ quota_local.o \ quota_global.o \ xattr.o \ - acl.o + acl.o \ + filecheck.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c new file mode 100644 index 000000000000..2cabbcf2f28e --- /dev/null +++ b/fs/ocfs2/filecheck.c @@ -0,0 +1,606 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.c + * + * Code which implements online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/sysctl.h> +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "ocfs2_fs.h" +#include "stackglue.h" +#include "inode.h" + +#include "filecheck.h" + + +/* File check error strings, + * must correspond with error number in header file. + */ +static const char * const ocfs2_filecheck_errs[] = { + "SUCCESS", + "FAILED", + "INPROGRESS", + "READONLY", + "INJBD", + "INVALIDINO", + "BLOCKECC", + "BLOCKNO", + "VALIDFLAG", + "GENERATION", + "UNSUPPORTED" +}; + +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); +static LIST_HEAD(ocfs2_filecheck_sysfs_list); + +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ + struct list_head fs_list; + atomic_t fs_count; + struct super_block *fs_sb; + struct kset *fs_devicekset; + struct kset *fs_fcheckkset; + struct ocfs2_filecheck *fs_fcheck; +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_entry { + struct list_head fe_list; + unsigned long fe_ino; + unsigned int fe_type; + unsigned int fe_done:1; + unsigned int fe_status:31; +}; + +struct ocfs2_filecheck_args { + unsigned int fa_type; + union { + unsigned long fa_ino; + unsigned int fa_len; + }; +}; + +static const char * +ocfs2_filecheck_error(int errno) +{ + if (!errno) + return ocfs2_filecheck_errs[errno]; + + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || + errno > OCFS2_FILECHECK_ERR_END); + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_attr_filecheck_chk = + __ATTR(check, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_fix = + __ATTR(fix, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_set = + __ATTR(set, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); + +static int ocfs2_filecheck_sysfs_wait(atomic_t *p) +{ + schedule(); + return 0; +} + +static void +ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) +{ + struct ocfs2_filecheck_entry *p; + + if (!atomic_dec_and_test(&entry->fs_count)) + wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, + TASK_UNINTERRUPTIBLE); + + spin_lock(&entry->fs_fcheck->fc_lock); + while (!list_empty(&entry->fs_fcheck->fc_head)) { + p = list_first_entry(&entry->fs_fcheck->fc_head, + struct ocfs2_filecheck_entry, fe_list); + list_del(&p->fe_list); + BUG_ON(!p->fe_done); /* To free a undone file check entry */ + kfree(p); + } + spin_unlock(&entry->fs_fcheck->fc_lock); + + kset_unregister(entry->fs_fcheckkset); + kset_unregister(entry->fs_devicekset); + kfree(entry->fs_fcheck); + kfree(entry); +} + +static void +ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) +{ + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); +} + +static int ocfs2_filecheck_sysfs_del(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p; + + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + list_del(&p->fs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + ocfs2_filecheck_sysfs_free(p); + return 0; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return 1; +} + +static void +ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) +{ + if (atomic_dec_and_test(&entry->fs_count)) + wake_up_atomic_t(&entry->fs_count); +} + +static struct ocfs2_filecheck_sysfs_entry * +ocfs2_filecheck_sysfs_get(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p = NULL; + + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + atomic_inc(&p->fs_count); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return p; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return NULL; +} + +int ocfs2_filecheck_create_sysfs(struct super_block *sb) +{ + int ret = 0; + struct kset *device_kset = NULL; + struct kset *fcheck_kset = NULL; + struct ocfs2_filecheck *fcheck = NULL; + struct ocfs2_filecheck_sysfs_entry *entry = NULL; + struct attribute **attrs = NULL; + struct attribute_group attrgp; + + if (!ocfs2_kset) + return -ENOMEM; + + attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); + if (!attrs) { + ret = -ENOMEM; + goto error; + } else { + attrs[0] = &ocfs2_attr_filecheck_chk.attr; + attrs[1] = &ocfs2_attr_filecheck_fix.attr; + attrs[2] = &ocfs2_attr_filecheck_set.attr; + attrs[3] = NULL; + memset(&attrgp, 0, sizeof(attrgp)); + attrgp.attrs = attrs; + } + + fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); + if (!fcheck) { + ret = -ENOMEM; + goto error; + } else { + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + } + + if (strlen(sb->s_id) <= 0) { + mlog(ML_ERROR, + "Cannot get device basename when create filecheck sysfs\n"); + ret = -ENODEV; + goto error; + } + + device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); + if (!device_kset) { + ret = -ENOMEM; + goto error; + } + + fcheck_kset = kset_create_and_add("filecheck", NULL, + &device_kset->kobj); + if (!fcheck_kset) { + ret = -ENOMEM; + goto error; + } + + ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); + if (ret) + goto error; + + entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto error; + } else { + atomic_set(&entry->fs_count, 1); + entry->fs_sb = sb; + entry->fs_devicekset = device_kset; + entry->fs_fcheckkset = fcheck_kset; + entry->fs_fcheck = fcheck; + ocfs2_filecheck_sysfs_add(entry); + } + + kfree(attrs); + return 0; + +error: + kfree(attrs); + kfree(entry); + kfree(fcheck); + kset_unregister(fcheck_kset); + kset_unregister(device_kset); + return ret; +} + +int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +{ + return ocfs2_filecheck_sysfs_del(sb->s_id); +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count); +static int +ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int len) +{ + int ret; + + if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE)) + return -EINVAL; + + spin_lock(&ent->fs_fcheck->fc_lock); + if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { + mlog(ML_ERROR, + "Cannot set online file check maximum entry number " + "to %u due to too many pending entries(%u)\n", + len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); + ret = -EBUSY; + } else { + if (len < ent->fs_fcheck->fc_size) + BUG_ON(!ocfs2_filecheck_erase_entries(ent, + ent->fs_fcheck->fc_size - len)); + + ent->fs_fcheck->fc_max = len; + ret = 0; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + return ret; +} + +#define OCFS2_FILECHECK_ARGS_LEN 24 +static int +ocfs2_filecheck_args_get_long(const char *buf, size_t count, + unsigned long *val) +{ + char buffer[OCFS2_FILECHECK_ARGS_LEN]; + + memcpy(buffer, buf, count); + buffer[count] = '\0'; + + if (kstrtoul(buffer, 0, val)) + return 1; + + return 0; +} + +static int +ocfs2_filecheck_type_parse(const char *name, unsigned int *type) +{ + if (!strncmp(name, "fix", 4)) + *type = OCFS2_FILECHECK_TYPE_FIX; + else if (!strncmp(name, "check", 6)) + *type = OCFS2_FILECHECK_TYPE_CHK; + else if (!strncmp(name, "set", 4)) + *type = OCFS2_FILECHECK_TYPE_SET; + else + return 1; + + return 0; +} + +static int +ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, + struct ocfs2_filecheck_args *args) +{ + unsigned long val = 0; + unsigned int type; + + /* too short/long args length */ + if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN)) + return 1; + + if (ocfs2_filecheck_type_parse(name, &type)) + return 1; + if (ocfs2_filecheck_args_get_long(buf, count, &val)) + return 1; + + if (val <= 0) + return 1; + + args->fa_type = type; + if (type == OCFS2_FILECHECK_TYPE_SET) + args->fa_len = (unsigned int)val; + else + args->fa_ino = val; + + return 0; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + unsigned int type; + struct ocfs2_filecheck_entry *p; + struct ocfs2_filecheck_sysfs_entry *ent; + + if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) + return -EINVAL; + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->name); + return -ENODEV; + } + + if (type == OCFS2_FILECHECK_TYPE_SET) { + spin_lock(&ent->fs_fcheck->fc_lock); + total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); + spin_unlock(&ent->fs_fcheck->fc_lock); + goto exit; + } + + ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n"); + total += ret; + remain -= ret; + spin_lock(&ent->fs_fcheck->fc_lock); + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_type != type) + continue; + + ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n", + p->fe_ino, p->fe_done, + ocfs2_filecheck_error(p->fe_status)); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return total; +} + +static int +ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_done) { + list_del(&p->fe_list); + kfree(p); + ent->fs_fcheck->fc_size--; + ent->fs_fcheck->fc_done--; + return 1; + } + } + + return 0; +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count) +{ + unsigned int i = 0; + unsigned int ret = 0; + + while (i++ < count) { + if (ocfs2_filecheck_erase_entry(ent)) + ret++; + else + break; + } + + return (ret == count ? 1 : 0); +} + +static void +ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + entry->fe_done = 1; + spin_lock(&ent->fs_fcheck->fc_lock); + ent->fs_fcheck->fc_done++; + spin_unlock(&ent->fs_fcheck->fc_lock); +} + +static unsigned int +ocfs2_filecheck_handle(struct super_block *sb, + unsigned long ino, unsigned int flags) +{ + unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; + struct inode *inode = NULL; + int rc; + + inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + if (IS_ERR(inode)) { + rc = (int)(-(long)inode); + if (rc >= OCFS2_FILECHECK_ERR_START && + rc < OCFS2_FILECHECK_ERR_END) + ret = rc; + else + ret = OCFS2_FILECHECK_ERR_FAILED; + } else + iput(inode); + + return ret; +} + +static void +ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); + else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); + else + entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; + + ocfs2_filecheck_done_entry(ent, entry); +} + +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct ocfs2_filecheck_args args; + struct ocfs2_filecheck_entry *entry; + struct ocfs2_filecheck_sysfs_entry *ent; + ssize_t ret = 0; + + if (count == 0) + return count; + + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { + mlog(ML_ERROR, "Invalid arguments for online file check\n"); + return -EINVAL; + } + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->parent->name); + return -ENODEV; + } + + if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { + ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); + goto exit; + } + + entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto exit; + } + + spin_lock(&ent->fs_fcheck->fc_lock); + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_ERROR, + "Cannot do more file check " + "since file check queue(%u) is full now\n", + ent->fs_fcheck->fc_max); + ret = -EBUSY; + kfree(entry); + } else { + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done > 0)) { + /* Delete the oldest entry which was done, + * make sure the entry size in list does + * not exceed maximum value + */ + BUG_ON(!ocfs2_filecheck_erase_entry(ent)); + } + + entry->fe_ino = args.fa_ino; + entry->fe_type = args.fa_type; + entry->fe_done = 0; + entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS; + list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head); + ent->fs_fcheck->fc_size++; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + if (!ret) + ocfs2_filecheck_handle_entry(ent, entry); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return (!ret ? count : ret); +} diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h new file mode 100644 index 000000000000..e5cd002a2c09 --- /dev/null +++ b/fs/ocfs2/filecheck.h @@ -0,0 +1,49 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.h + * + * Online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef FILECHECK_H +#define FILECHECK_H + +#include <linux/types.h> +#include <linux/list.h> + + +/* File check errno */ +enum { + OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */ + OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */ + OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */ + OCFS2_FILECHECK_ERR_READONLY, /* Read only */ + OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */ + OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */ + OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */ + OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */ + OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */ + OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */ + OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */ +}; + +#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED +#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED + +int ocfs2_filecheck_create_sysfs(struct super_block *sb); +int ocfs2_filecheck_remove_sysfs(struct super_block *sb); + +#endif /* FILECHECK_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 36294446d960..ba495beff1c2 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -53,6 +53,7 @@ #include "xattr.h" #include "refcounttree.h" #include "ocfs2_trace.h" +#include "filecheck.h" #include "buffer_head_io.h" @@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh); +static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type); +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh); + void ocfs2_set_inode_flags(struct inode *inode) { unsigned int flags = OCFS2_I(inode)->ip_attr; @@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { + int rc = 0; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; @@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } trace_ocfs2_iget5_locked(inode->i_state); if (inode->i_state & I_NEW) { - ocfs2_read_locked_inode(inode, &args); + rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } if (is_bad_inode(inode)) { iput(inode); - inode = ERR_PTR(-ESTALE); + if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || + (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) + /* Return OCFS2_FILECHECK_ERR_XXX related errno */ + inode = ERR_PTR(rc); + else + inode = ERR_PTR(-ESTALE); goto bail; } @@ -410,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; - int status, can_lock; + int status, can_lock, lock_level = 0; u32 generation = 0; status = -EINVAL; @@ -478,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, mlog_errno(status); return status; } - status = ocfs2_inode_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); @@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode, } if (can_lock) { - status = ocfs2_read_inode_block_full(inode, &bh, - OCFS2_BH_IGNORE_CACHE); + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 0); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 1); + else + status = ocfs2_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ - if (!status && !buffer_jbd(bh)) - status = ocfs2_validate_inode_block(osb->sb, bh); + if (!status && !buffer_jbd(bh)) { + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_validate_inode_block( + osb->sb, bh); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_repair_inode_block( + osb->sb, bh); + else + status = ocfs2_validate_inode_block( + osb->sb, bh); + } } if (status < 0) { mlog_errno(status); @@ -532,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode, BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + if (buffer_dirty(bh) && !buffer_jbd(bh)) { + if (can_lock) { + ocfs2_inode_unlock(inode, lock_level); + lock_level = 1; + ocfs2_inode_lock(inode, NULL, lock_level); + } + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + status = 0; bail: if (can_lock) - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); @@ -1397,6 +1441,169 @@ bail: return rc; } +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_filecheck_validate_inode_block( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * Call ocfs2_validate_meta_ecc() first since it has ecc repair + * function, but we should not return error immediately when ecc + * validation fails, because the reason is quite likely the invalid + * inode number inputed. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, + "Filecheck: checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_BLOCKECC; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, di->i_signature); + rc = -OCFS2_FILECHECK_ERR_INVALIDINO; + goto bail; + } else if (rc) + goto bail; + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = -OCFS2_FILECHECK_ERR_BLOCKNO; + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " + "not set\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + rc = -OCFS2_FILECHECK_ERR_GENERATION; + goto bail; + } + +bail: + return rc; +} + +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int changed = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + if (!ocfs2_filecheck_validate_inode_block(sb, bh)) + return 0; + + trace_ocfs2_filecheck_repair_inode_block( + (unsigned long long)bh->b_blocknr); + + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu " + "on readonly filesystem\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_READONLY; + } + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu, " + "its buffer is in jbd\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_INJBD; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + /* Cannot fix invalid inode block */ + return -OCFS2_FILECHECK_ERR_INVALIDINO; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + /* Cannot just add VALID_FL flag back as a fix, + * need more things to check here. + */ + return -OCFS2_FILECHECK_ERR_VALIDFLAG; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + di->i_blkno = cpu_to_le64(bh->b_blocknr); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: fs_generation to %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + } + + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { + ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); + mark_buffer_dirty(bh); + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: compute meta ecc\n", + (unsigned long long)bh->b_blocknr); + } + + return 0; +} + +static int +ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type) +{ + int rc; + struct buffer_head *tmp = *bh; + + if (!type) /* Check inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_validate_inode_block); + else /* Repair inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_repair_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags) { diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index aac8b86f312e..01635e016b3e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4 +#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8 + struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index a52a2dbc064e..24b7e7f591dc 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1540,6 +1540,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block); TRACE_EVENT(ocfs2_inode_is_valid_to_delete, TP_PROTO(void *task, void *dc_task, unsigned long long ino, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 5d965e83bd43..13219ed73e1d 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; -static struct kset *ocfs2_kset; +struct kset *ocfs2_kset; +EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 66334a30cea8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +extern struct kset *ocfs2_kset; + #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 302854ee0985..ccc9386c42c5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -74,6 +74,7 @@ #include "suballoc.h" #include "buffer_head_io.h" +#include "filecheck.h" static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; @@ -1205,6 +1206,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); + /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */ + ocfs2_filecheck_create_sysfs(sb); + return status; read_super_error: @@ -1668,6 +1672,7 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); + ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/open.c b/fs/open.c index 55bdc75e2172..17cb6b1dab75 100644 --- a/fs/open.c +++ b/fs/open.c @@ -992,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) EXPORT_SYMBOL(filp_open); struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *filename, int flags) + const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, 0, &op); + int err = build_open_flags(flags, mode, &op); if (err) return ERR_PTR(err); - if (flags & O_CREAT) - return ERR_PTR(-EINVAL); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 2c6f0cb816b4..c54a24360f85 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -4,3 +4,4 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o +ubifs-y += misc.o diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c new file mode 100644 index 000000000000..486a2844949f --- /dev/null +++ b/fs/ubifs/misc.c @@ -0,0 +1,57 @@ +#include <linux/kernel.h> +#include "ubifs.h" + +/* Normal UBIFS messages */ +void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_notice("UBIFS (ubi%d:%d): %pV\n", + c->vi.ubi_num, c->vi.vol_id, &vaf); + + va_end(args); +} \ + +/* UBIFS error messages */ +void ubifs_err(const struct ubifs_info *c, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("UBIFS error (ubi%d:%d pid %d): %ps: %pV\n", + c->vi.ubi_num, c->vi.vol_id, current->pid, + __builtin_return_address(0), + &vaf); + + va_end(args); +} \ + +/* UBIFS warning messages */ +void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("UBIFS warning (ubi%d:%d pid %d): %ps: %pV\n", + c->vi.ubi_num, c->vi.vol_id, current->pid, + __builtin_return_address(0), + &vaf); + + va_end(args); +} diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index a5697de763f5..c2a57e193a81 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -42,30 +42,6 @@ /* Version of this UBIFS implementation */ #define UBIFS_VERSION 1 -/* Normal UBIFS messages */ -#define ubifs_msg(c, fmt, ...) \ - pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \ - (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__) -/* UBIFS error messages */ -#define ubifs_err(c, fmt, ...) \ - pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \ - (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \ - __func__, ##__VA_ARGS__) -/* UBIFS warning messages */ -#define ubifs_warn(c, fmt, ...) \ - pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \ - (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \ - __func__, ##__VA_ARGS__) -/* - * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description - * object as an argument. - */ -#define ubifs_errc(c, fmt, ...) \ - do { \ - if (!(c)->probing) \ - ubifs_err(c, fmt, ##__VA_ARGS__); \ - } while (0) - /* UBIFS file system VFS magic number */ #define UBIFS_SUPER_MAGIC 0x24051905 @@ -1802,4 +1778,21 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, #include "misc.h" #include "key.h" +/* Normal UBIFS messages */ +__printf(2, 3) +void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); +__printf(2, 3) +void ubifs_err(const struct ubifs_info *c, const char *fmt, ...); +__printf(2, 3) +void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); +/* + * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description + * object as an argument. + */ +#define ubifs_errc(c, fmt, ...) \ +do { \ + if (!(c)->probing) \ + ubifs_err(c, fmt, ##__VA_ARGS__); \ +} while (0) + #endif /* !__UBIFS_H__ */ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index c7f4d434d098..b043e044121d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -59,7 +59,6 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/xattr.h> -#include <linux/posix_acl_xattr.h> /* * Limit the number of extended attributes per inode so that the total size diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index f64639176670..3542d94fddce 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -121,4 +121,5 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o -xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o +xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o +xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 2816d42507bc..a1b2dd828b9d 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, -#ifdef CONFIG_NFSD_PNFS +#ifdef CONFIG_NFSD_BLOCKLAYOUT .get_uuid = xfs_fs_get_uuid, .map_blocks = xfs_fs_map_blocks, .commit_blocks = xfs_fs_commit_blocks, diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index 8147ac108820..93f74853961b 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -1,7 +1,7 @@ #ifndef _XFS_PNFS_H #define _XFS_PNFS_H 1 -#ifdef CONFIG_NFSD_PNFS +#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, struct iomap *iomap, bool write, u32 *device_generation); |